% Textbook "The Elements of Statistical Learning". The key is kept unchanged so
% existing citations still resolve; authors are given in title-page order.
% Series and city are split out of the publisher field, and the volume/number
% pair (a scraper artifact that standard styles warn about on books) is dropped.
@book{friedman2001elements,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  title     = {The Elements of Statistical Learning},
  series    = {Springer Series in Statistics},
  publisher = {Springer},
  address   = {New York},
  year      = {2001}
}

% LIBSVM practical guide: a technical report, not a journal article, so it is
% typed as a techreport with the issuing institution; Taipei is the address.
@techreport{hsu2003practical,
  author      = {Hsu, Chih-Wei and Chang, Chih-Chung and Lin, Chih-Jen},
  title       = {A practical guide to support vector classification},
  institution = {Department of Computer Science, National Taiwan University},
  address     = {Taipei},
  year        = {2003}
}

% Deep learning review in Nature; the page field now carries the full range.
@article{lecun2015deep,
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  title     = {Deep learning},
  journal   = {Nature},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  year      = {2015},
  publisher = {Nature Publishing Group}
}

% Original SVM paper (COLT 1992). The booktitle is not recased by standard
% styles, so it is stored in its proper capitalisation.
@inproceedings{boser1992svm,
  author    = {Boser, Bernhard E and Guyon, Isabelle M and Vapnik, Vladimir N},
  title     = {A training algorithm for optimal margin classifiers},
  booktitle = {Proceedings of the Fifth Annual Workshop on Computational Learning Theory},
  pages     = {144--152},
  year      = {1992},
  publisher = {ACM}
}

% MART applied to epidemiology; journal name stored with its proper capitals.
@article{friedman2003mart,
  author    = {Friedman, Jerome H and Meulman, Jacqueline J},
  title     = {Multiple additive regression trees with application in epidemiology},
  journal   = {Statistics in Medicine},
  volume    = {22},
  number    = {9},
  pages     = {1365--1381},
  year      = {2003},
  publisher = {Wiley Online Library}
}

% Working guide to boosted regression trees (Journal of Animal Ecology, 2008).
@article{elith2008brt,
  author    = {Elith, Jane and Leathwick, John R and Hastie, Trevor},
  title     = {A working guide to boosted regression trees},
  journal   = {Journal of Animal Ecology},
  year      = {2008},
  volume    = {77},
  number    = {4},
  pages     = {802--813},
  publisher = {Wiley Online Library}
}

% Gradient boosting machine paper; volume and issue were missing from the export.
@article{friedman2001gbm,
  author    = {Friedman, Jerome H},
  title     = {Greedy function approximation: a gradient boosting machine},
  journal   = {The Annals of Statistics},
  volume    = {29},
  number    = {5},
  pages     = {1189--1232},
  year      = {2001},
  publisher = {JSTOR}
}

% XGBoost paper (KDD 2016). The tool name is brace-protected so sentence-casing
% styles do not turn it into "Xgboost"; acronyms in the booktitle are restored.
@inproceedings{chen2016xgboost,
  author    = {Chen, Tianqi and Guestrin, Carlos},
  title     = {{XGBoost}: A scalable tree boosting system},
  booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  pages     = {785--794},
  year      = {2016},
  publisher = {ACM}
}

% Horvath epigenetic clock paper. "DNA" is brace-protected against sentence-casing.
% NOTE(review): the pages field looks like an article number rather than a page
% range -- confirm against the publisher record before changing it.
@article{horvath2013dna,
  author    = {Horvath, Steve},
  title     = {{DNA} methylation age of human tissues and cell types},
  journal   = {Genome Biology},
  volume    = {14},
  number    = {10},
  pages     = {3156},
  year      = {2013},
  publisher = {BioMed Central}
}

% Methylation in prefrontal cortex development and aging; "DNA" is brace-protected
% so sentence-casing styles do not render it as "Dna".
@article{numata2012dna,
  author    = {Numata, Shusuke and Ye, Tianzhang and Hyde, Thomas M and Guitart-Navarro, Xavier and Tao, Ran and Wininger, Michael and Colantuoni, Carlo and Weinberger, Daniel R and Kleinman, Joel E and Lipska, Barbara K},
  title     = {{DNA} methylation signatures in development and aging of the human prefrontal cortex},
  journal   = {The American Journal of Human Genetics},
  volume    = {90},
  number    = {2},
  pages     = {260--272},
  year      = {2012},
  publisher = {Elsevier}
}

% Elastic net paper; journal name capitalised to match the sibling JRSS-B entry
% in this file (tibshirani1996regression).
@article{zou2005regularization,
  author    = {Zou, Hui and Hastie, Trevor},
  title     = {Regularization and variable selection via the elastic net},
  journal   = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  volume    = {67},
  number    = {2},
  pages     = {301--320},
  year      = {2005},
  publisher = {Wiley Online Library}
}

% glmnet / coordinate descent paper; pages now hold the full range instead of
% only the first page, and the journal name is capitalised.
@article{friedman2010regularization,
  author    = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob},
  title     = {Regularization paths for generalized linear models via coordinate descent},
  journal   = {Journal of Statistical Software},
  volume    = {33},
  number    = {1},
  pages     = {1--22},
  year      = {2010},
  publisher = {NIH Public Access}
}

% Lasso paper (JRSS-B, 1996).
@article{tibshirani1996regression,
  author    = {Tibshirani, Robert},
  title     = {Regression shrinkage and selection via the lasso},
  journal   = {Journal of the Royal Statistical Society: Series B (Methodological)},
  year      = {1996},
  volume    = {58},
  number    = {1},
  pages     = {267--288},
  publisher = {Wiley Online Library}
}

% Ridge regression paper (Technometrics, 1970).
@article{hoerl1970ridge,
  author    = {Hoerl, Arthur E and Kennard, Robert W},
  title     = {Ridge regression: Biased estimation for nonorthogonal problems},
  journal   = {Technometrics},
  year      = {1970},
  volume    = {12},
  number    = {1},
  pages     = {55--67},
  publisher = {Taylor \& Francis Group}
}

% Random forests paper; journal name stored with its proper capitals.
@article{breiman2001random,
  author    = {Breiman, Leo},
  title     = {Random forests},
  journal   = {Machine Learning},
  volume    = {45},
  number    = {1},
  pages     = {5--32},
  year      = {2001},
  publisher = {Springer}
}

% SMOTE over-sampling paper. The acronym is brace-protected so sentence-casing
% styles do not render it as "Smote".
@article{smote,
  author  = {Chawla, Nitesh V and Bowyer, Kevin W and Hall, Lawrence O and Kegelmeyer, W Philip},
  title   = {{SMOTE}: synthetic minority over-sampling technique},
  journal = {Journal of Artificial Intelligence Research},
  volume  = {16},
  pages   = {321--357},
  year    = {2002}
}
% Libbrecht and Noble review of machine learning in genomics. The whole-title
% double braces from the export are removed so styles can apply their own casing,
% and the month uses the standard macro.
@article{enhancerImbalance,
  author  = {Libbrecht, M. W. and Noble, W. S.},
  title   = {Machine learning applications in genetics and genomics},
  journal = {Nat. Rev. Genet.},
  year    = {2015},
  volume  = {16},
  number  = {6},
  pages   = {321--332},
  month   = jun
}

% Model class reliance preprint. An arXiv identifier is not a journal, so the
% entry is typed as misc with eprint fields; howpublished keeps the identifier
% visible under styles that ignore eprint.
@misc{mcr,
  author        = {Fisher, Aaron and Rudin, Cynthia and Dominici, Francesca},
  title         = {All Models are Wrong but many are Useful: Variable Importance for Black-Box, Proprietary, or Misspecified Prediction Models, using Model Class Reliance},
  howpublished  = {arXiv preprint arXiv:1801.01489},
  eprint        = {1801.01489},
  archiveprefix = {arXiv},
  year          = {2018}
}
% DALEX R package paper (JMLR). Page range uses a double hyphen, the package and
% language names are brace-protected, and the author is in "Last, First" form.
@article{dalex,
  author  = {Biecek, Przemyslaw},
  title   = {{DALEX}: Explainers for Complex Predictive Models in {R}},
  journal = {Journal of Machine Learning Research},
  year    = {2018},
  volume  = {19},
  number  = {84},
  pages   = {1--5},
  url     = {http://jmlr.org/papers/v19/18-416.html}
}
% Glioma CpG island methylator phenotype paper (PubMed-style export, cleaned up).
% Only "CpG" is brace-protected; the whole-title double braces are removed and
% the month uses the standard macro.
@article{pmid20399149,
  author  = {Noushmehr, H. and Weisenberger, D. J. and Diefes, K. and Phillips, H. S. and Pujara, K. and Berman, B. P. and Pan, F. and Pelloski, C. E. and Sulman, E. P. and Bhat, K. P. and Verhaak, R. G. and Hoadley, K. A. and Hayes, D. N. and Perou, C. M. and Schmidt, H. K. and Ding, L. and Wilson, R. K. and Van Den Berg, D. and Shen, H. and Bengtsson, H. and Neuvial, P. and Cope, L. M. and Buckley, J. and Herman, J. G. and Baylin, S. B. and Laird, P. W. and Aldape, K.},
  title   = {Identification of a {CpG} island methylator phenotype that defines a distinct subgroup of glioma},
  journal = {Cancer Cell},
  year    = {2010},
  volume  = {17},
  number  = {5},
  pages   = {510--522},
  month   = may
}

% Review of machine learning in cancer prognosis; whole-title double braces from
% the export are removed so styles can apply their own casing.
@article{pmid25750696,
  author  = {Kourou, K. and Exarchos, T. P. and Exarchos, K. P. and Karamouzis, M. V. and Fotiadis, D. I.},
  title   = {Machine learning applications in cancer prognosis and prediction},
  journal = {Comput Struct Biotechnol J},
  year    = {2015},
  volume  = {13},
  pages   = {8--17}
}

% Deep-learning variant caller paper. "SNP" is protected as a whole word rather
% than letter by letter, and the numeric month "11" becomes the nov macro.
@article{pmid30247488,
  author  = {Poplin, R. and Chang, P. C. and Alexander, D. and Schwartz, S. and Colthurst, T. and Ku, A. and Newburger, D. and Dijamco, J. and Nguyen, N. and Afshar, P. T. and Gross, S. S. and Dorfman, L. and McLean, C. Y. and DePristo, M. A.},
  title   = {A universal {SNP} and small-indel variant caller using deep neural networks},
  journal = {Nat. Biotechnol.},
  year    = {2018},
  volume  = {36},
  number  = {10},
  pages   = {983--987},
  month   = nov
}

% 26301843 
% Deep-learning model for noncoding variant effects; export artifacts (double
% braces around the title, quoted month) are removed.
@article{pmid26301843,
  author  = {Zhou, J. and Troyanskaya, O. G.},
  title   = {Predicting effects of noncoding variants with deep learning-based sequence model},
  journal = {Nat. Methods},
  year    = {2015},
  volume  = {12},
  number  = {10},
  pages   = {931--934},
  month   = oct
}


% Genomics and drug response review; export artifacts (double braces around the
% title, quoted month) are removed.
@article{pmid21428770,
  author  = {Wang, L. and McLeod, H. L. and Weinshilboum, R. M.},
  title   = {Genomics and drug response},
  journal = {N. Engl. J. Med.},
  year    = {2011},
  volume  = {364},
  number  = {12},
  pages   = {1144--1153},
  month   = mar
}

% SVM-based enhancer prediction paper; export artifacts (double braces around
% the title, quoted month) are removed.
@article{pmid22328731,
  author  = {Fernandez, M. and Miranda-Saavedra, D.},
  title   = {Genome-wide enhancer prediction from epigenetic signatures using genetic algorithm-optimized support vector machines},
  journal = {Nucleic Acids Res.},
  year    = {2012},
  volume  = {40},
  number  = {10},
  pages   = {e77},
  month   = may
}

% Gene expression modelling from chromatin features; export artifacts (double
% braces around the title, quoted month) are removed.
% NOTE(review): the month value is carried over from the export unchanged --
% confirm it against the issue date.
@article{pmid22950368,
  author  = {Dong, X. and Greven, M. C. and Kundaje, A. and Djebali, S. and Brown, J. B. and Cheng, C. and Gingeras, T. R. and Gerstein, M. and Guigo, R. and Birney, E. and Weng, Z.},
  title   = {Modeling gene expression using chromatin features in various cellular contexts},
  journal = {Genome Biol.},
  year    = {2012},
  volume  = {13},
  number  = {9},
  pages   = {R53},
  month   = jun
}

% Review of gene prediction methods; export artifacts (double braces around the
% title, quoted month) are removed.
@article{pmid12364589,
  author  = {Mathe, C. and Sagot, M. F. and Schiex, T. and Rouze, P.},
  title   = {Current methods of gene prediction, their strengths and weaknesses},
  journal = {Nucleic Acids Res.},
  year    = {2002},
  volume  = {30},
  number  = {19},
  pages   = {4103--4117},
  month   = oct
}

% methylSig paper (Bioinformatics, 2014). Bibliographic fields first, abstract last.
@article{Park2014-sr,
  author   = {Park, Yongseok and Figueroa, Maria E and Rozek, Laura S and
              Sartor, Maureen A},
  title    = {{MethylSig}: a whole genome {DNA} methylation analysis pipeline},
  journal  = {Bioinformatics},
  year     = {2014},
  month    = sep,
  volume   = {30},
  number   = {17},
  pages    = {2414--2422},
  language = {en},
  abstract = {MOTIVATION: DNA methylation plays critical roles in gene
              regulation and cellular specification without altering DNA
              sequences. The wide application of reduced representation
              bisulfite sequencing (RRBS) and whole genome bisulfite sequencing
              (bis-seq) opens the door to study DNA methylation at single CpG
              site resolution. One challenging question is how best to test for
              significant methylation differences between groups of biological
              samples in order to minimize false positive findings. RESULTS: We
              present a statistical analysis package, methylSig, to analyse
              genome-wide methylation differences between samples from
              different treatments or disease groups. MethylSig takes into
              account both read coverage and biological variation by utilizing
              a beta-binomial approach across biological samples for a CpG site
              or region, and identifies relevant differences in CpG
              methylation. It can also incorporate local information to improve
              group methylation level and/or variance estimation for
              experiments with small sample size. A permutation study based on
              data from enhanced RRBS samples shows that methylSig maintains a
              well-calibrated type-I error when the number of samples is three
              or more per group. Our simulations show that methylSig has higher
              sensitivity compared with several alternative methods. The use of
              methylSig is illustrated with a comparison of different subtypes
              of acute leukemia and normal bone marrow samples. AVAILABILITY:
              methylSig is available as an R package at
              http://sartorlab.ccmb.med.umich.edu/software. SUPPLEMENTARY
              INFORMATION: Supplementary data are available at Bioinformatics
              online.}
}

% Epigenomics of hESC multilineage differentiation (Cell, 2013).
% Bibliographic fields first, abstract last.
@article{Xie2013-cf,
  author   = {Xie, Wei and Schultz, Matthew D and Lister, Ryan and Hou,
              Zhonggang and Rajagopal, Nisha and Ray, Pradipta and Whitaker,
              John W and Tian, Shulan and Hawkins, R David and Leung, Danny and
              Yang, Hongbo and Wang, Tao and Lee, Ah Young and Swanson, Scott A
              and Zhang, Jiuchun and Zhu, Yun and Kim, Audrey and Nery, Joseph
              R and Urich, Mark A and Kuan, Samantha and Yen, Chia-An and
              Klugman, Sarit and Yu, Pengzhi and Suknuntha, Kran and Propson,
              Nicholas E and Chen, Huaming and Edsall, Lee E and Wagner, Ulrich
              and Li, Yan and Ye, Zhen and Kulkarni, Ashwinikumar and Xuan,
              Zhenyu and Chung, Wen-Yu and Chi, Neil C and Antosiewicz-Bourget,
              Jessica E and Slukvin, Igor and Stewart, Ron and Zhang, Michael Q
              and Wang, Wei and Thomson, James A and Ecker, Joseph R and Ren,
              Bing},
  title    = {Epigenomic analysis of multilineage differentiation of human
              embryonic stem cells},
  journal  = {Cell},
  year     = {2013},
  month    = may,
  volume   = {153},
  number   = {5},
  pages    = {1134--1148},
  language = {en},
  abstract = {Epigenetic mechanisms have been proposed to play crucial roles in
              mammalian development, but their precise functions are only
              partially understood. To investigate epigenetic regulation of
              embryonic development, we differentiated human embryonic stem
              cells into mesendoderm, neural progenitor cells, trophoblast-like
              cells, and mesenchymal stem cells and systematically
              characterized DNA methylation, chromatin modifications, and the
              transcriptome in each lineage. We found that promoters that are
              active in early developmental stages tend to be CG rich and
              mainly engage H3K27me3 upon silencing in nonexpressing lineages.
              By contrast, promoters for genes expressed preferentially at
              later stages are often CG poor and primarily employ DNA
              methylation upon repression. Interestingly, the early
              developmental regulatory genes are often located in large genomic
              domains that are generally devoid of DNA methylation in most
              lineages, which we termed DNA methylation valleys (DMVs). Our
              results suggest that distinct epigenetic mechanisms regulate
              early and late stages of ES cell differentiation.}
}

% Trans-acting factor colocalization study (Cell, 2013).
% Bibliographic fields first, abstract last.
@article{Xie2013-ol,
  author   = {Xie, Dan and Boyle, Alan P and Wu, Linfeng and Zhai, Jie and
              Kawli, Trupti and Snyder, Michael},
  title    = {Dynamic trans-acting factor colocalization in human cells},
  journal  = {Cell},
  year     = {2013},
  month    = oct,
  volume   = {155},
  number   = {3},
  pages    = {713--724},
  language = {en},
  abstract = {Different trans-acting factors (TFs) collaborate and act in
              concert at distinct loci to perform accurate regulation of their
              target genes. To date, the cobinding of TF pairs has been
              investigated in a limited context both in terms of the number of
              factors within a cell type and across cell types and the extent
              of combinatorial colocalizations. Here, we use an approach to
              analyze TF colocalization within a cell type and across multiple
              cell lines at an unprecedented level. We extend this approach
              with large-scale mass spectrometry analysis of
              immunoprecipitations of 50 TFs. Our combined approach reveals
              large numbers of interesting TF-TF associations. We observe
              extensive change in TF colocalizations both within a cell type
              exposed to different conditions and across multiple cell types.
              We show distinct functional annotations and properties of
              different TF cobinding patterns and provide insights into the
              complex regulatory landscape of the cell.}
}

% Epigenetic polymorphism and stochastic DMR formation (Nat. Genet., 2012).
@article{Landan2012-id,
  author  = {Landan, Gilad and Cohen, Netta Mendelson and Mukamel, Zohar and
             Bar, Amir and Molchadsky, Alina and Brosh, Ran and Horn-Saban,
             Shirley and Zalcenstein, Daniela Amann and Goldfinger, Naomi and
             Zundelevich, Adi and Gal-Yam, Einav Nili and Rotter, Varda and
             Tanay, Amos},
  title   = {Epigenetic polymorphism and the stochastic formation of
             differentially methylated regions in normal and cancerous tissues},
  journal = {Nat. Genet.},
  year    = {2012},
  volume  = {44},
  number  = {11},
  pages   = {1207--1214}
}

% Review on analysing DNA methylation data (Nat. Rev. Genet., 2012).
% Bibliographic fields first, abstract last.
@article{Bock2012-zm,
  author   = {Bock, Christoph},
  title    = {Analysing and interpreting {DNA} methylation data},
  journal  = {Nat. Rev. Genet.},
  year     = {2012},
  month    = oct,
  volume   = {13},
  number   = {10},
  pages    = {705--719},
  language = {en},
  abstract = {DNA methylation is an epigenetic mark that has suspected
              regulatory roles in a broad range of biological processes and
              diseases. The technology is now available for studying DNA
              methylation genome-wide, at a high resolution and in a large
              number of samples. This Review discusses relevant concepts,
              computational methods and software tools for analysing and
              interpreting DNA methylation data. It focuses not only on the
              bioinformatic challenges of large epigenome-mapping projects and
              epigenome-wide association studies but also highlights software
              tools that make genome-wide DNA methylation mapping more
              accessible for laboratories with limited bioinformatics
              experience.}
}

% genomation R package paper (Bioinformatics, 2015).
% Bibliographic fields first, abstract last.
@article{Akalin2015-yk,
  author   = {Akalin, Altuna and Franke, Vedran and Vlahovi{\v c}ek, Kristian
              and Mason, Christopher E and Sch{\"u}beler, Dirk},
  title    = {genomation: a toolkit to summarize, annotate and visualize
              genomic intervals},
  journal  = {Bioinformatics},
  year     = {2015},
  month    = apr,
  volume   = {31},
  number   = {7},
  pages    = {1127--1129},
  abstract = {UNLABELLED: Biological insights can be obtained through
              computational integration of genomics data sets consisting of
              diverse types of information. The integration is often hampered
              by a large variety of existing file formats, often containing
              similar information, and the necessity to use complicated tools
              to achieve the desired results. We have built an R package,
              genomation, to expedite the extraction of biological information
              from high throughput data. The package works with a variety of
              genomic interval file types and enables easy summarization and
              annotation of high throughput data sets with given genomic
              annotations. AVAILABILITY AND IMPLEMENTATION: The software is
              currently distributed under MIT artistic license and freely
              available at http://bioinformatics.mdc-berlin.de/genomation, and
              through the Bioconductor framework. CONTACT:
              dirk.schubeler@fmi.ch, chm2042@med.cornell.edu,
              altuna.akalin@fmi.ch, or aakalin@gmail.com.}
}

% High-occupancy target (HOT) regions study.
% NOTE(review): no venue, howpublished, url or doi is recorded, so this will
% render as author/title/year only -- presumably a preprint; confirm and add
% the source.
@misc{Wreczycka2017-yt,
  author = {Wreczycka, Katarzyna and Franke, Vedran and Uyar, Bora and Wurmus,
            Ricardo and Akalin, Altuna},
  title  = {{HOT} or not: Examining the basis of high-occupancy target regions},
  year   = {2017}
}

% Base-pair resolution methylation sequencing in AML (PLoS Genet., 2012).
% Bibliographic fields first, abstract last.
@article{Akalin2012-ve,
  author   = {Akalin, Altuna and Garrett-Bakelman, Francine E and Kormaksson,
              Matthias and Busuttil, Jennifer and Zhang, Lu and Khrebtukova,
              Irina and Milne, Thomas A and Huang, Yongsheng and Biswas,
              Debabrata and Hess, Jay L and Allis, C David and Roeder, Robert G
              and Valk, Peter J M and L{\"o}wenberg, Bob and Delwel, Ruud and
              Fernandez, Hugo F and Paietta, Elisabeth and Tallman, Martin S
              and Schroth, Gary P and Mason, Christopher E and Melnick, Ari and
              Figueroa, Maria E},
  title    = {Base-pair resolution {DNA} methylation sequencing reveals
              profoundly divergent epigenetic landscapes in acute myeloid
              leukemia},
  journal  = {PLoS Genet.},
  year     = {2012},
  month    = jun,
  volume   = {8},
  number   = {6},
  pages    = {e1002781},
  abstract = {We have developed an enhanced form of reduced representation
              bisulfite sequencing with extended genomic coverage, which
              resulted in greater capture of DNA methylation information of
              regions lying outside of traditional CpG islands. Applying this
              method to primary human bone marrow specimens from patients with
              Acute Myelogeneous Leukemia (AML), we demonstrated that
              genetically distinct AML subtypes display diametrically opposed
              DNA methylation patterns. As compared to normal controls, we
              observed widespread hypermethylation in IDH mutant AMLs,
              preferentially targeting promoter regions and CpG islands
              neighboring the transcription start sites of genes. In contrast,
              AMLs harboring translocations affecting the MLL gene displayed
              extensive loss of methylation of an almost mutually exclusive set
              of CpGs, which instead affected introns and distal intergenic CpG
              islands and shores. When analyzed in conjunction with gene
              expression profiles, it became apparent that these specific
              patterns of DNA methylation result in differing roles in gene
              expression regulation. However, despite this subtype-specific DNA
              methylation patterning, a much smaller set of CpG sites are
              consistently affected in both AML subtypes. Most CpG sites in
              this common core of aberrantly methylated CpGs were
              hypermethylated in both AML subtypes. Therefore, aberrant DNA
              methylation patterns in AML do not occur in a stereotypical
              manner but rather are highly specific and associated with
              specific driving genetic lesions.}
}

% Bisulfighter paper (Nucleic Acids Res., 2014).
% Bibliographic fields first, abstract last.
@article{Saito2014-ij,
  author   = {Saito, Yutaka and Tsuji, Junko and Mituyama, Toutai},
  title    = {Bisulfighter: accurate detection of methylated cytosines and
              differentially methylated regions},
  journal  = {Nucleic Acids Res.},
  year     = {2014},
  month    = apr,
  volume   = {42},
  number   = {6},
  pages    = {e45},
  language = {en},
  abstract = {Analysis of bisulfite sequencing data usually requires two tasks:
              to call methylated cytosines (mCs) in a sample, and to detect
              differentially methylated regions (DMRs) between paired samples.
              Although numerous tools have been proposed for mC calling,
              methods for DMR detection have been largely limited. Here, we
              present Bisulfighter, a new software package for detecting mCs
              and DMRs from bisulfite sequencing data. Bisulfighter combines
              the LAST alignment tool for mC calling, and a novel framework for
              DMR detection based on hidden Markov models (HMMs). Unlike
              previous attempts that depend on empirical parameters,
              Bisulfighter can use the expectation-maximization algorithm for
              HMMs to adjust parameters for each data set. We conduct extensive
              experiments in which accuracy of mC calling and DMR detection is
              evaluated on simulated data with various mC contexts, read
              qualities, sequencing depths and DMR lengths, as well as on real
              data from a wide range of biological processes. We demonstrate
              that Bisulfighter consistently achieves better accuracy than
              other published tools, providing greater sensitivity for mCs with
              fewer false positives, more precise estimates of mC levels, more
              exact locations of DMRs and better agreement of DMRs with gene
              expression and DNase I hypersensitivity. The source code is
              available at http://epigenome.cbrc.jp/bisulfighter.}
}

% Alignment of bisulfite-converted reads (Nucleic Acids Res., 2012).
% Bibliographic fields first, abstract last.
@article{Frith2012-ne,
  author   = {Frith, Martin C and Mori, Ryota and Asai, Kiyoshi},
  title    = {A mostly traditional approach improves alignment of
              bisulfite-converted {DNA}},
  journal  = {Nucleic Acids Res.},
  year     = {2012},
  month    = jul,
  volume   = {40},
  number   = {13},
  pages    = {e100},
  language = {en},
  abstract = {Cytosines in genomic DNA are sometimes methylated. This affects
              many biological processes and diseases. The standard way of
              measuring methylation is to use bisulfite, which converts
              unmethylated cytosines to thymines, then sequence the DNA and
              compare it to a reference genome sequence. We describe a method
              for the critical step of aligning the DNA reads to the correct
              genomic locations. Our method builds on classic alignment
              techniques, including likelihood-ratio scores and spaced seeds.
              In a realistic benchmark, our method has a better combination of
              sensitivity, specificity and speed than nine other
              high-throughput bisulfite aligners. This study enables more
              accurate and rational analysis of DNA methylation. It also
              illustrates how to adapt general-purpose alignment methods to a
              special case with distorted base patterns: this should be
              informative for other special cases such as ancient DNA and
              AT-rich genomes.}
}

% Medulloblastoma methylation landscape (Nature, 2014).
% Bibliographic fields first, abstract last.
@article{Hovestadt2014-kd,
  author   = {Hovestadt, Volker and Jones, David T W and Picelli, Simone and
              Wang, Wei and Kool, Marcel and Northcott, Paul A and Sultan, Marc
              and Stachurski, Katharina and Ryzhova, Marina and Warnatz,
              Hans-J{\"o}rg and Ralser, Meryem and Brun, Sonja and Bunt, Jens
              and J{\"a}ger, Natalie and Kleinheinz, Kortine and Erkek, Serap
              and Weber, Ursula D and Bartholomae, Cynthia C and von Kalle,
              Christof and Lawerenz, Chris and Eils, J{\"u}rgen and Koster, Jan
              and Versteeg, Rogier and Milde, Till and Witt, Olaf and Schmidt,
              Sabine and Wolf, Stephan and Pietsch, Torsten and Rutkowski,
              Stefan and Scheurlen, Wolfram and Taylor, Michael D and Brors,
              Benedikt and Felsberg, J{\"o}rg and Reifenberger, Guido and
              Borkhardt, Arndt and Lehrach, Hans and Wechsler-Reya, Robert J
              and Eils, Roland and Yaspo, Marie-Laure and Landgraf, Pablo and
              Korshunov, Andrey and Zapatka, Marc and Radlwimmer, Bernhard and
              Pfister, Stefan M and Lichter, Peter},
  title    = {Decoding the regulatory landscape of medulloblastoma using {DNA}
              methylation sequencing},
  journal  = {Nature},
  year     = {2014},
  month    = jun,
  volume   = {510},
  number   = {7506},
  pages    = {537--541},
  language = {en},
  abstract = {Epigenetic alterations, that is, disruption of DNA methylation
              and chromatin architecture, are now acknowledged as a universal
              feature of tumorigenesis. Medulloblastoma, a clinically
              challenging, malignant childhood brain tumour, is no exception.
              Despite much progress from recent genomics studies, with
              recurrent changes identified in each of the four distinct tumour
              subgroups (WNT-pathway-activated, SHH-pathway-activated, and the
              less-well-characterized Group 3 and Group 4), many cases still
              lack an obvious genetic driver. Here we present whole-genome
              bisulphite-sequencing data from thirty-four human and five murine
              tumours plus eight human and three murine normal controls,
              augmented with matched whole-genome, RNA and chromatin
              immunoprecipitation sequencing data. This comprehensive data set
              allowed us to decipher several features underlying the interplay
              between the genome, epigenome and transcriptome, and its effects
              on medulloblastoma pathophysiology. Most notable were highly
              prevalent regions of hypomethylation correlating with increased
              gene expression, extending tens of kilobases downstream of
              transcription start sites. Focal regions of low methylation
              linked to transcription-factor-binding sites shed light on
              differential transcriptional networks between subgroups, whereas
              increased methylation due to re-normalization of repressed
              chromatin in DNA methylation valleys was positively correlated
              with gene expression. Large, partially methylated domains
              affecting up to one-third of the genome showed increased mutation
              rates and gene silencing in a subgroup-specific fashion.
              Epigenetic alterations also affected novel medulloblastoma
              candidate genes (for example, LIN28B), resulting in alternative
              promoter usage and/or differential messenger RNA/microRNA
              expression. Analysis of mouse medulloblastoma and precursor-cell
              methylation demonstrated a somatic origin for many alterations.
              Our data provide insights into the epigenetic regulation of
              transcription and genome organization in medulloblastoma
              pathogenesis, which are probably also of importance in a wider
              developmental and disease context.}
}

% Review of cancer methylome profiling (Trends Genet., 2014).
% Bibliographic fields first, keywords and abstract last.
@article{Stirzaker2014-ao,
  author   = {Stirzaker, Clare and Taberlay, Phillippa C and Statham, Aaron L
              and Clark, Susan J},
  title    = {Mining cancer methylomes: prospects and challenges},
  journal  = {Trends Genet.},
  year     = {2014},
  month    = feb,
  volume   = {30},
  number   = {2},
  pages    = {75--84},
  language = {en},
  keywords = {DNA methylation; cancer methylome; epigenetics},
  abstract = {There are over 28 million CpG sites in the human genome.
              Assessing the methylation status of each of these sites will be
              required to understand fully the role of DNA methylation in
              health and disease. Genome-wide analysis, using arrays and
              high-throughput sequencing, has enabled assessment of large
              fractions of the methylome, but each protocol comes with unique
              advantages and disadvantages. Notably, except for whole-genome
              bisulfite sequencing, most commonly used genome-wide methods
              detect <5\% of all CpG sites. Here, we discuss approaches for
              methylome studies and compare genome coverage of promoters,
              genes, and intergenic regions, and capacity to quantitate
              individual CpG methylation states. Finally, we examine the extent
              of published cancer methylomes that have been generated using
              genome-wide approaches.}
}

% Book chapter on genome-wide methylation analysis. The publisher field, which
% incollection entries require, was missing from the export.
% NOTE(review): the volume editors are not recorded -- add an editor field once
% confirmed against the book's front matter.
@incollection{Baubec2016-pt,
  author    = {Baubec, Tuncay and Akalin, Altuna},
  title     = {{Genome-Wide} Analysis of {DNA} Methylation Patterns by
               {High-Throughput} Sequencing},
  booktitle = {Field Guidelines for Genetic Experimental Designs in
               {High-Throughput} Sequencing},
  publisher = {Springer},
  pages     = {197--221},
  year      = {2016}
}

% swDMR sliding-window DMR caller (PLoS One, 2015).
% Bibliographic fields first, abstract last.
@article{Wang2015-of,
  author   = {Wang, Zhen and Li, Xianfeng and Jiang, Yi and Shao, Qianzhi and
              Liu, Qi and Chen, Bingyu and Huang, Dongsheng},
  title    = {{swDMR}: A Sliding Window Approach to Identify Differentially
              Methylated Regions Based on Whole Genome Bisulfite Sequencing},
  journal  = {PLoS One},
  year     = {2015},
  month    = jul,
  volume   = {10},
  number   = {7},
  pages    = {e0132866},
  language = {en},
  abstract = {DNA methylation is a widespread epigenetic modification that
              plays an essential role in gene expression through
              transcriptional regulation and chromatin remodeling. The
              emergence of whole genome bisulfite sequencing (WGBS) represents
              an important milestone in the detection of DNA methylation.
              Characterization of differential methylated regions (DMRs) is
              fundamental as well for further functional analysis. In this
              study, we present swDMR (http://sourceforge.net/projects/swDMR/)
              for the comprehensive analysis of DMRs from whole genome
              methylation profiles by a sliding window approach. It is an
              integrated tool designed for WGBS data, which not only implements
              accessible statistical methods to perform hypothesis test adapted
              to two or more samples without replicates, but false discovery
              rate was also controlled by multiple test correction. Downstream
              analysis tools were also provided, including cluster, annotation
              and visualization modules. In summary, based on WGBS data, swDMR
              can produce abundant information of differential methylated
              regions. As a convenient and flexible tool, we believe swDMR will
              bring us closer to unveil the potential functional regions
              involved in epigenetic regulation.}
}

% oxBS-Seq method paper (oxidative bisulfite sequencing of 5mC and 5hmC).
@article{Booth2012-nl,
  author = {Booth, Michael J and Branco, Miguel R and Ficz, Gabriella and
    Oxley, David and Krueger, Felix and Reik, Wolf and
    Balasubramanian, Shankar},
  title = {Quantitative sequencing of 5-methylcytosine and
    5-hydroxymethylcytosine at single-base resolution},
  journal = {Science},
  year = {2012},
  month = may,
  volume = {336},
  number = {6083},
  pages = {934--937},
  language = {en},
  abstract = {5-Methylcytosine can be converted to 5-hydroxymethylcytosine
    (5hmC) in mammalian DNA by the ten-eleven translocation (TET)
    enzymes. We introduce oxidative bisulfite sequencing (oxBS-Seq),
    the first method for quantitative mapping of 5hmC in genomic DNA
    at single-nucleotide resolution. Selective chemical oxidation of
    5hmC to 5-formylcytosine (5fC) enables bisulfite conversion of
    5fC to uracil. We demonstrate the utility of oxBS-Seq to map and
    quantify 5hmC at CpG islands (CGIs) in mouse embryonic stem (ES)
    cells and identify 800 5hmC-containing CGIs that have on average
    3.3\% hydroxymethylation. High levels of 5hmC were found in CGIs
    associated with transcriptional regulators and in long
    interspersed nuclear elements, suggesting that these regions
    might undergo epigenetic reprogramming in ES cells. Our results
    open new questions on 5hmC dynamics and sequence-specific
    targeting by TETs.},
}

% BRAT-BW: FM-index based mapper for bisulfite-treated short reads.
@article{Harris2012-hz,
  author = {Harris, Elena Y and Ponts, Nadia and Le Roch, Karine G and
    Lonardi, Stefano},
  title = {{BRAT-BW}: efficient and accurate mapping of bisulfite-treated
    reads},
  journal = {Bioinformatics},
  year = {2012},
  month = jul,
  volume = {28},
  number = {13},
  pages = {1795--1796},
  language = {en},
  abstract = {SUMMARY: We introduce BRAT-BW, a fast, accurate and
    memory-efficient tool that maps bisulfite-treated short reads
    (BS-seq) to a reference genome using the FM-index
    (Burrows-Wheeler transform). BRAT-BW is significantly more memory
    efficient and faster on longer reads than current
    state-of-the-art tools for BS-seq data, without compromising on
    accuracy. BRAT-BW is a part of a software suite for genome-wide
    single base-resolution methylation data analysis that supports
    single and paired-end reads and includes a tool for estimation of
    methylation level at each cytosine. AVAILABILITY: The software is
    available in the public domain at
    http://compbio.cs.ucr.edu/brat/.},
}

% TAB-seq protocol paper. The journal title is stored in full (not the
% "Nat. Protoc." abbreviation) so the bibliography style decides on
% abbreviation, matching the unabbreviated journal names used elsewhere.
@ARTICLE{Yu2012-wm,
  title    = "Tet-assisted bisulfite sequencing of 5-hydroxymethylcytosine",
  author   = "Yu, Miao and Hon, Gary C and Szulwach, Keith E and Song,
              Chun-Xiao and Jin, Peng and Ren, Bing and He, Chuan",
  abstract = "A complete understanding of the potential function of
              5-hydroxymethylcytosine (5-hmC), a DNA cytosine modification in
              mammalian cells, requires an accurate single-base resolution
              sequencing method. Here we describe a modified
              bisulfite-sequencing method, Tet-assisted bisulfite sequencing
              (TAB-seq), which can identify 5-hmC at single-base resolution, as
              well as determine its abundance at each modification site. This
              protocol involves $\beta$-glucosyltransferase
              ($\beta$-GT)-mediated protection of 5-hmC (glucosylation) and
              recombinant mouse Tet1(mTet1)-mediated oxidation of
              5-methylcytosine (5-mC) to 5-carboxylcytosine (5-caC). After the
              subsequent bisulfite treatment and PCR amplification, both
              cytosine and 5-caC (derived from 5-mC) are converted to thymine
              (T), whereas 5-hmC reads as C. The treated genomic DNA is
              suitable for both whole-genome and locus-specific sequencing. The
              entire procedure (which does not include data analysis) can be
              completed in 14 d for whole-genome sequencing or 7 d for
              locus-specific sequencing.",
  journal  = "Nature Protocols",
  volume   =  7,
  number   =  12,
  pages    = "2159--2170",
  month    =  dec,
  year     =  2012,
  language = "en"
}

% ChIPpeakAnno package paper. "Bioconductor" is brace-protected in the title
% so that sentence-casing bibliography styles do not lowercase the proper noun.
@ARTICLE{Zhu2010-ny,
  title    = "{ChIPpeakAnno}: a {Bioconductor} package to annotate {ChIP-seq}
              and {ChIP-chip} data",
  author   = "Zhu, Lihua J and Gazin, Claude and Lawson, Nathan D and
              Pag{\`e}s, Herv{\'e} and Lin, Simon M and Lapointe, David S and
              Green, Michael R",
  abstract = "BACKGROUND: Chromatin immunoprecipitation (ChIP) followed by
              high-throughput sequencing (ChIP-seq) or ChIP followed by genome
              tiling array analysis (ChIP-chip) have become standard
              technologies for genome-wide identification of DNA-binding
              protein target sites. A number of algorithms have been developed
              in parallel that allow identification of binding sites from
              ChIP-seq or ChIP-chip datasets and subsequent visualization in
              the University of California Santa Cruz (UCSC) Genome Browser as
              custom annotation tracks. However, summarizing these tracks can
              be a daunting task, particularly if there are a large number of
              binding sites or the binding sites are distributed widely across
              the genome. RESULTS: We have developed ChIPpeakAnno as a
              Bioconductor package within the statistical programming
              environment R to facilitate batch annotation of enriched peaks
              identified from ChIP-seq, ChIP-chip, cap analysis of gene
              expression (CAGE) or any experiments resulting in a large number
              of enriched genomic regions. The binding sites annotated with
              ChIPpeakAnno can be viewed easily as a table, a pie chart or
              plotted in histogram form, i.e., the distribution of distances to
              the nearest genes for each set of peaks. In addition, we have
              implemented functionalities for determining the significance of
              overlap between replicates or binding sites among transcription
              factors within a complex, and for drawing Venn diagrams to
              visualize the extent of the overlap between replicates.
              Furthermore, the package includes functionalities to retrieve
              sequences flanking putative binding sites for PCR amplification,
              cloning, or motif discovery, and to identify Gene Ontology (GO)
              terms associated with adjacent genes. CONCLUSIONS: ChIPpeakAnno
              enables batch annotation of the binding sites identified from
              ChIP-seq, ChIP-chip, CAGE or any technology that results in a
              large number of enriched genomic regions within the statistical
              programming environment R. Allowing users to pass their own
              annotation data such as a different Chromatin immunoprecipitation
              (ChIP) preparation and a dataset from literature, or existing
              annotation packages, such as GenomicFeatures and BSgenome,
              provides flexibility. Tight integration to the biomaRt package
              enables up-to-date annotation retrieval from the BioMart
              database.",
  journal  = "BMC Bioinformatics",
  volume   =  11,
  pages    = "237",
  month    =  may,
  year     =  2010,
  language = "en"
}

% Brain-development methylome paper. The pages field holds a single article
% number; the former "1237905--1237905" range had identical endpoints and
% would print as a degenerate page range.
@ARTICLE{Lister2013-vs,
  title   = "Global Epigenomic Reconfiguration During Mammalian Brain
             Development",
  author  = "Lister, R and Mukamel, E A and Nery, J R and Urich, M and
             Puddifoot, C A and Johnson, N D and Lucero, J and Huang, Y and
             Dwork, A J and Schultz, M D and Yu, M and Tonti-Filippini, J and
             Heyn, H and Hu, S and Wu, J C and Rao, A and Esteller, M and He, C
             and Haghighi, F G and Sejnowski, T J and Behrens, M M and Ecker, J
             R",
  journal = "Science",
  volume  =  341,
  number  =  6146,
  pages   = "1237905",
  year    =  2013
}

% DSS method paper (Wald test on a lognormal-beta-binomial model). "Bayesian"
% is brace-protected against sentence-casing styles, and the journal title is
% stored in full rather than abbreviated.
@ARTICLE{Feng2014-pd,
  title    = "A {Bayesian} hierarchical model to detect differentially
              methylated loci from single nucleotide resolution sequencing
              data",
  author   = "Feng, Hao and Conneely, Karen N and Wu, Hao",
  abstract = "DNA methylation is an important epigenetic modification that has
              essential roles in cellular processes including gene regulation,
              development and disease and is widely dysregulated in most types
              of cancer. Recent advances in sequencing technology have enabled
              the measurement of DNA methylation at single nucleotide
              resolution through methods such as whole-genome bisulfite
              sequencing and reduced representation bisulfite sequencing. In
              DNA methylation studies, a key task is to identify differences
              under distinct biological contexts, for example, between tumor
              and normal tissue. A challenge in sequencing studies is that the
              number of biological replicates is often limited by the costs of
              sequencing. The small number of replicates leads to unstable
              variance estimation, which can reduce accuracy to detect
              differentially methylated loci (DML). Here we propose a novel
              statistical method to detect DML when comparing two treatment
              groups. The sequencing counts are described by a
              lognormal-beta-binomial hierarchical model, which provides a
              basis for information sharing across different CpG sites. A Wald
              test is developed for hypothesis testing at each CpG site.
              Simulation results show that the proposed method yields improved
              DML detection compared to existing methods, particularly when the
              number of replicates is low. The proposed method is implemented
              in the Bioconductor package DSS.",
  journal  = "Nucleic Acids Research",
  volume   =  42,
  number   =  8,
  pages    = "e69",
  month    =  apr,
  year     =  2014,
  language = "en"
}

% Bison: cluster-aware aligner for bisulfite sequencing reads.
@article{Ryan2014-im,
  author = {Ryan, Devon Patrick and Ehninger, Dan},
  title = {Bison: bisulfite alignment on nodes of a cluster},
  journal = {BMC Bioinformatics},
  year = {2014},
  month = oct,
  volume = {15},
  pages = {337},
  language = {en},
  abstract = {BACKGROUND: DNA methylation changes are associated with a wide
    array of biological processes. Bisulfite conversion of DNA
    followed by high-throughput sequencing is increasingly being used
    to assess genome-wide methylation at single-base resolution. The
    relative slowness of most commonly used aligners for processing
    such data introduces an unnecessarily long delay between receipt
    of raw data and statistical analysis. While this process can be
    sped-up by using computer clusters, current tools are not
    designed with them in mind and end-users must create such
    implementations themselves. RESULTS: Here, we present a novel
    BS-seq aligner, Bison, which exploits multiple nodes of a
    computer cluster to speed up this process and also has increased
    accuracy. Bison is accompanied by a variety of helper programs
    and scripts to ease, as much as possible, the process of quality
    control and preparing results for statistical analysis by a
    variety of popular R packages. Bison is also accompanied by
    bison\_herd, a variant of Bison with the same output but that can
    scale to a semi-arbitrary number of nodes, with concomitant
    increased demands on the underlying message passing interface
    implementation. CONCLUSIONS: Bison is a new bisulfite-converted
    short-read aligner providing end users easier scalability for
    performance gains, more accurate alignments, and a convenient
    pathway for quality controlling alignments and converting
    methylation calls into a form appropriate for statistical
    analysis. Bison and the more scalable bison\_herd are natively
    able to utilize multiple nodes of a computer cluster
    simultaneously and serve to simplify to the process of creating
    analysis pipelines.},
}

% MethPipe / MethBase paper. The journal identifies this article by an
% e-number rather than a page range, so the DOI is recorded as the stable
% locator.
@ARTICLE{Song2013-cr,
  title    = "A reference methylome database and analysis pipeline to
              facilitate integrative and comparative epigenomics",
  author   = "Song, Qiang and Decato, Benjamin and Hong, Elizabeth E and Zhou,
              Meng and Fang, Fang and Qu, Jianghan and Garvin, Tyler and
              Kessler, Michael and Zhou, Jun and Smith, Andrew D",
  abstract = "DNA methylation is implicated in a surprising diversity of
              regulatory, evolutionary processes and diseases in eukaryotes.
              The introduction of whole-genome bisulfite sequencing has enabled
              the study of DNA methylation at a single-base resolution,
              revealing many new aspects of DNA methylation and highlighting
              the usefulness of methylome data in understanding a variety of
              genomic phenomena. As the number of publicly available
              whole-genome bisulfite sequencing studies reaches into the
              hundreds, reliable and convenient tools for comparing and
              analyzing methylomes become increasingly important. We present
              MethPipe, a pipeline for both low and high-level methylome
              analysis, and MethBase, an accompanying database of annotated
              methylomes from the public domain. Together these resources
              enable researchers to extract interesting features from
              methylomes and compare them with those identified in public
              methylomes in our database.",
  journal  = "PLoS One",
  volume   =  8,
  number   =  12,
  pages    = "e81148",
  doi      = "10.1371/journal.pone.0081148",
  month    =  dec,
  year     =  2013,
  language = "en"
}

% Single-author article on the function of DNA methylation.
@article{Schubeler2015-ai,
  author = {Sch{\"u}beler, Dirk},
  title = {Function and information content of {DNA} methylation},
  journal = {Nature},
  year = {2015},
  month = jan,
  volume = {517},
  number = {7534},
  pages = {321--326},
  language = {en},
  abstract = {Cytosine methylation is a DNA modification generally associated
    with transcriptional silencing. Factors that regulate methylation
    have been linked to human disease, yet how they contribute to
    malignances remains largely unknown. Genomic maps of DNA
    methylation have revealed unexpected dynamics at gene regulatory
    regions, including active demethylation by TET proteins at
    binding sites for transcription factors. These observations
    indicate that the underlying DNA sequence largely accounts for
    local patterns of methylation. As a result, this mark is highly
    informative when studying gene regulation in normal and diseased
    cells, and it can potentially function as a biomarker. Although
    these findings challenge the view that methylation is generally
    instructive for gene silencing, several open questions remain,
    including how methylation is targeted and recognized and in what
    context it affects genome readout.},
}

% WT1 / TET2 hydroxymethylation study in AML. The journal title is stored in
% full (not the "Cell Rep." abbreviation) so the bibliography style decides on
% abbreviation.
@ARTICLE{Rampal2014-lw,
  title    = "{DNA} hydroxymethylation profiling reveals that {WT1} mutations
              result in loss of {TET2} function in acute myeloid leukemia",
  author   = "Rampal, Raajit and Alkalin, Altuna and Madzo, Jozef and
              Vasanthakumar, Aparna and Pronier, Elodie and Patel, Jay and Li,
              Yushan and Ahn, Jihae and Abdel-Wahab, Omar and Shih, Alan and
              Lu, Chao and Ward, Patrick S and Tsai, Jennifer J and Hricik,
              Todd and Tosello, Valeria and Tallman, Jacob E and Zhao, Xinyang
              and Daniels, Danette and Dai, Qing and Ciminio, Luisa and
              Aifantis, Iannis and He, Chuan and Fuks, Francois and Tallman,
              Martin S and Ferrando, Adolfo and Nimer, Stephen and Paietta,
              Elisabeth and Thompson, Craig B and Licht, Jonathan D and Mason,
              Christopher E and Godley, Lucy A and Melnick, Ari and Figueroa,
              Maria E and Levine, Ross L",
  abstract = "Somatic mutations in IDH1/IDH2 and TET2 result in impaired
              TET2-mediated conversion of 5-methylcytosine (5mC) to
              5-hydroxymethylcytosine (5hmC). The observation that WT1
              inactivating mutations anticorrelate with TET2/IDH1/IDH2
              mutations in acute myeloid leukemia (AML) led us to hypothesize
              that WT1 mutations may impact TET2 function. WT1 mutant AML
              patients have reduced 5hmC levels similar to TET2/IDH1/IDH2
              mutant AML. These mutations are characterized by convergent,
              site-specific alterations in DNA hydroxymethylation, which drive
              differential gene expression more than alterations in DNA
              promoter methylation. WT1 overexpression increases global levels
              of 5hmC, and WT1 silencing reduced 5hmC levels. WT1 physically
              interacts with TET2 and TET3, and WT1 loss of function results in
              a similar hematopoietic differentiation phenotype as observed
              with TET2 deficiency. These data provide a role for WT1 in
              regulating DNA hydroxymethylation and suggest that TET2 IDH1/IDH2
              and WT1 mutations define an AML subtype defined by dysregulated
              DNA hydroxymethylation.",
  journal  = "Cell Reports",
  volume   =  9,
  number   =  5,
  pages    = "1841--1855",
  month    =  dec,
  year     =  2014
}

% CAB-Seq method paper (base-resolution 5caC detection). The journal title is
% stored in full (not "J. Am. Chem. Soc.") so the bibliography style decides
% on abbreviation.
@ARTICLE{Lu2013-dy,
  title    = "Chemical modification-assisted bisulfite sequencing ({CAB-Seq})
              for 5-carboxylcytosine detection in {DNA}",
  author   = "Lu, Xingyu and Song, Chun-Xiao and Szulwach, Keith and Wang,
              Zhipeng and Weidenbacher, Payton and Jin, Peng and He, Chuan",
  abstract = "5-Methylcytosine (5mC) in DNA can be oxidized stepwise to
              5-hydroxymethylcytosine (5hmC), 5-formylcytosine (5fC), and
              5-carboxylcytosine (5caC) by the TET family proteins. Thymine DNA
              glycosylase can further remove 5fC and 5caC, connecting 5mC
              oxidation with active DNA demethylation. Here, we present a
              chemical modification-assisted bisulfite sequencing (CAB-Seq)
              that can detect 5caC with single-base resolution in DNA. We
              optimized 1-ethyl-3-[3-dimethylaminopropyl]carbodiimide
              hydrochloride (EDC)-catalyzed amide bond formation between the
              carboxyl group of 5caC and a primary amine group. We found that
              the modified 5caC can survive the bisulfite treatment without
              deamination. Therefore, this chemical labeling coupled with
              bisulfite treatment provides a base-resolution detection and
              sequencing method for 5caC.",
  journal  = "Journal of the American Chemical Society",
  volume   =  135,
  number   =  25,
  pages    = "9315--9317",
  month    =  jun,
  year     =  2013,
  language = "en"
}

% DMR detection in targeted bisulfite sequencing data (Bioinformatics, 2013).
@article{Hebestreit2013-jo,
  author = {Hebestreit, K and Dugas, M and Klein, H-U},
  title = {Detection of significantly differentially methylated regions in
    targeted bisulfite sequencing data},
  journal = {Bioinformatics},
  year = {2013},
  volume = {29},
  number = {13},
  pages = {1647--1653},
}

% TET1 paper: conversion of 5mC to 5-hydroxymethylcytosine in mammalian DNA.
@article{Tahiliani2009-ar,
  author = {Tahiliani, Mamta and Koh, Kian Peng and Shen, Yinghua and Pastor,
    William A and Bandukwala, Hozefa and Brudno, Yevgeny and Agarwal,
    Suneet and Iyer, Lakshminarayan M and Liu, David R and Aravind, L
    and Rao, Anjana},
  title = {Conversion of 5-methylcytosine to 5-hydroxymethylcytosine in
    mammalian {DNA} by {MLL} partner {TET1}},
  journal = {Science},
  year = {2009},
  month = may,
  volume = {324},
  number = {5929},
  pages = {930--935},
  language = {en},
  abstract = {DNA cytosine methylation is crucial for retrotransposon silencing
    and mammalian development. In a computational search for enzymes
    that could modify 5-methylcytosine (5mC), we identified TET
    proteins as mammalian homologs of the trypanosome proteins JBP1
    and JBP2, which have been proposed to oxidize the 5-methyl group
    of thymine. We show here that TET1, a fusion partner of the MLL
    gene in acute myeloid leukemia, is a 2-oxoglutarate (2OG)- and
    Fe(II)-dependent enzyme that catalyzes conversion of 5mC to
    5-hydroxymethylcytosine (hmC) in cultured cells and in vitro. hmC
    is present in the genome of mouse embryonic stem cells, and hmC
    levels decrease upon RNA interference-mediated depletion of TET1.
    Thus, TET proteins have potential roles in epigenetic regulation
    through modification of 5mC to hmC.},
}

% Methylation profiling of human chromosomes and CpG islands by
% immunocapturing plus microarrays. The journal title is stored in full (not
% "Nat. Genet.") so the bibliography style decides on abbreviation.
@ARTICLE{Weber2005-rd,
  title    = "Chromosome-wide and promoter-specific analyses identify sites of
              differential {DNA} methylation in normal and transformed human
              cells",
  author   = "Weber, Michael and Davies, Jonathan J and Wittig, David and
              Oakeley, Edward J and Haase, Michael and Lam, Wan L and
              Sch{\"u}beler, Dirk",
  abstract = "Cytosine methylation is required for mammalian development and is
              often perturbed in human cancer. To determine how this epigenetic
              modification is distributed in the genomes of primary and
              transformed cells, we used an immunocapturing approach followed
              by DNA microarray analysis to generate methylation profiles of
              all human chromosomes at 80-kb resolution and for a large set of
              CpG islands. In primary cells we identified broad genomic regions
              of differential methylation with higher levels in gene-rich
              neighborhoods. Female and male cells had indistinguishable
              profiles for autosomes but differences on the X chromosome. The
              inactive X chromosome (Xi) was hypermethylated at only a subset
              of gene-rich regions and, unexpectedly, overall hypomethylated
              relative to its active counterpart. The chromosomal methylation
              profile of transformed cells was similar to that of primary
              cells. Nevertheless, we detected large genomic segments with
              hypomethylation in the transformed cell residing in gene-poor
              areas. Furthermore, analysis of 6,000 CpG islands showed that
              only a small set of promoters was methylated differentially,
              suggesting that aberrant methylation of CpG island promoters in
              malignancy might be less frequent than previously hypothesized.",
  journal  = "Nature Genetics",
  volume   =  37,
  number   =  8,
  pages    = "853--862",
  month    =  aug,
  year     =  2005,
  language = "en"
}

% ICGC consortium paper (Nature 464, 2010). The author field opens with the
% consortium as a brace-protected corporate name, followed by a long list of
% individual members. Several individuals appear more than once (e.g.
% "Hudson, Thomas J", "Shibata, Tatsuhiro"), and a few near-identical
% spellings occur ("Cros, Anthony" / "Cross, Anthony", "Himmelbauer, Heinz" /
% "Himmelbaue, Heinz", "Botwell, David D L" / "Bowtell, David D L").
% NOTE(review): presumably inherited from the upstream record, which may list
% members once per working group -- confirm against the publication before
% deduplicating or trimming this list.
@ARTICLE{International_Cancer_Genome_Consortium2010-fu,
  title    = "International network of cancer genome projects",
  author   = "{International Cancer Genome Consortium} and Hudson, Thomas J and
              Anderson, Warwick and Artez, Axel and Barker, Anna D and Bell,
              Cindy and Bernab{\'e}, Rosa R and Bhan, M K and Calvo, Fabien and
              Eerola, Iiro and Gerhard, Daniela S and Guttmacher, Alan and
              Guyer, Mark and Hemsley, Fiona M and Jennings, Jennifer L and
              Kerr, David and Klatt, Peter and Kolar, Patrik and Kusada, Jun
              and Lane, David P and Laplace, Frank and Youyong, Lu and
              Nettekoven, Gerd and Ozenberger, Brad and Peterson, Jane and Rao,
              T S and Remacle, Jacques and Schafer, Alan J and Shibata,
              Tatsuhiro and Stratton, Michael R and Vockley, Joseph G and
              Watanabe, Koichi and Yang, Huanming and Yuen, Matthew M F and
              Knoppers, Bartha M and Bobrow, Martin and Cambon-Thomsen, Anne
              and Dressler, Lynn G and Dyke, Stephanie O M and Joly, Yann and
              Kato, Kazuto and Kennedy, Karen L and Nicol{\'a}s, Pilar and
              Parker, Michael J and Rial-Sebbag, Emmanuelle and Romeo-Casabona,
              Carlos M and Shaw, Kenna M and Wallace, Susan and Wiesner,
              Georgia L and Zeps, Nikolajs and Lichter, Peter and Biankin,
              Andrew V and Chabannon, Christian and Chin, Lynda and
              Cl{\'e}ment, Bruno and de Alava, Enrique and Degos, Fran{\c
              c}oise and Ferguson, Martin L and Geary, Peter and Hayes, D Neil
              and Hudson, Thomas J and Johns, Amber L and Kasprzyk, Arek and
              Nakagawa, Hidewaki and Penny, Robert and Piris, Miguel A and
              Sarin, Rajiv and Scarpa, Aldo and Shibata, Tatsuhiro and van de
              Vijver, Marc and Futreal, P Andrew and Aburatani, Hiroyuki and
              Bay{\'e}s, M{\'o}nica and Botwell, David D L and Campbell, Peter
              J and Estivill, Xavier and Gerhard, Daniela S and Grimmond, Sean
              M and Gut, Ivo and Hirst, Martin and L{\'o}pez-Ot{\'\i}n, Carlos
              and Majumder, Partha and Marra, Marco and McPherson, John D and
              Nakagawa, Hidewaki and Ning, Zemin and Puente, Xose S and Ruan,
              Yijun and Shibata, Tatsuhiro and Stratton, Michael R and
              Stunnenberg, Hendrik G and Swerdlow, Harold and Velculescu,
              Victor E and Wilson, Richard K and Xue, Hong H and Yang, Liu and
              Spellman, Paul T and Bader, Gary D and Boutros, Paul C and
              Campbell, Peter J and Flicek, Paul and Getz, Gad and Guig{\'o},
              Roderic and Guo, Guangwu and Haussler, David and Heath, Simon and
              Hubbard, Tim J and Jiang, Tao and Jones, Steven M and Li, Qibin
              and L{\'o}pez-Bigas, Nuria and Luo, Ruibang and Muthuswamy,
              Lakshmi and Ouellette, B F Francis and Pearson, John V and
              Puente, Xose S and Quesada, Victor and Raphael, Benjamin J and
              Sander, Chris and Shibata, Tatsuhiro and Speed, Terence P and
              Stein, Lincoln D and Stuart, Joshua M and Teague, Jon W and
              Totoki, Yasushi and Tsunoda, Tatsuhiko and Valencia, Alfonso and
              Wheeler, David A and Wu, Honglong and Zhao, Shancen and Zhou,
              Guangyu and Stein, Lincoln D and Guig{\'o}, Roderic and Hubbard,
              Tim J and Joly, Yann and Jones, Steven M and Kasprzyk, Arek and
              Lathrop, Mark and L{\'o}pez-Bigas, Nuria and Ouellette, B F
              Francis and Spellman, Paul T and Teague, Jon W and Thomas, Gilles
              and Valencia, Alfonso and Yoshida, Teruhiko and Kennedy, Karen L
              and Axton, Myles and Dyke, Stephanie O M and Futreal, P Andrew
              and Gerhard, Daniela S and Gunter, Chris and Guyer, Mark and
              Hudson, Thomas J and McPherson, John D and Miller, Linda J and
              Ozenberger, Brad and Shaw, Kenna M and Kasprzyk, Arek and Stein,
              Lincoln D and Zhang, Junjun and Haider, Syed A and Wang, Jianxin
              and Yung, Christina K and Cros, Anthony and Cross, Anthony and
              Liang, Yong and Gnaneshan, Saravanamuttu and Guberman, Jonathan
              and Hsu, Jack and Bobrow, Martin and Chalmers, Don R C and Hasel,
              Karl W and Joly, Yann and Kaan, Terry S H and Kennedy, Karen L
              and Knoppers, Bartha M and Lowrance, William W and Masui, Tohru
              and Nicol{\'a}s, Pilar and Rial-Sebbag, Emmanuelle and Rodriguez,
              Laura Lyman and Vergely, Catherine and Yoshida, Teruhiko and
              Grimmond, Sean M and Biankin, Andrew V and Bowtell, David D L and
              Cloonan, Nicole and deFazio, Anna and Eshleman, James R and
              Etemadmoghadam, Dariush and Gardiner, Brooke B and Gardiner,
              Brooke A and Kench, James G and Scarpa, Aldo and Sutherland,
              Robert L and Tempero, Margaret A and Waddell, Nicola J and
              Wilson, Peter J and McPherson, John D and Gallinger, Steve and
              Tsao, Ming-Sound and Shaw, Patricia A and Petersen, Gloria M and
              Mukhopadhyay, Debabrata and Chin, Lynda and DePinho, Ronald A and
              Thayer, Sarah and Muthuswamy, Lakshmi and Shazand, Kamran and
              Beck, Timothy and Sam, Michelle and Timms, Lee and Ballin,
              Vanessa and Lu, Youyong and Ji, Jiafu and Zhang, Xiuqing and
              Chen, Feng and Hu, Xueda and Zhou, Guangyu and Yang, Qi and Tian,
              Geng and Zhang, Lianhai and Xing, Xiaofang and Li, Xianghong and
              Zhu, Zhenggang and Yu, Yingyan and Yu, Jun and Yang, Huanming and
              Lathrop, Mark and Tost, J{\"o}rg and Brennan, Paul and Holcatova,
              Ivana and Zaridze, David and Brazma, Alvis and Egevard, Lars and
              Prokhortchouk, Egor and Banks, Rosamonde Elizabeth and Uhl{\'e}n,
              Mathias and Cambon-Thomsen, Anne and Viksna, Juris and Ponten,
              Fredrik and Skryabin, Konstantin and Stratton, Michael R and
              Futreal, P Andrew and Birney, Ewan and Borg, Ake and
              B{\o}rresen-Dale, Anne-Lise and Caldas, Carlos and Foekens, John
              A and Martin, Sancha and Reis-Filho, Jorge S and Richardson,
              Andrea L and Sotiriou, Christos and Stunnenberg, Hendrik G and
              Thoms, Giles and van de Vijver, Marc and van't Veer, Laura and
              Calvo, Fabien and Birnbaum, Daniel and Blanche, H{\'e}l{\`e}ne
              and Boucher, Pascal and Boyault, Sandrine and Chabannon,
              Christian and Gut, Ivo and Masson-Jacquemier, Jocelyne D and
              Lathrop, Mark and Pauport{\'e}, Iris and Pivot, Xavier and
              Vincent-Salomon, Anne and Tabone, Eric and Theillet, Charles and
              Thomas, Gilles and Tost, J{\"o}rg and Treilleux, Isabelle and
              Calvo, Fabien and Bioulac-Sage, Paulette and Cl{\'e}ment, Bruno
              and Decaens, Thomas and Degos, Fran{\c c}oise and Franco,
              Dominique and Gut, Ivo and Gut, Marta and Heath, Simon and
              Lathrop, Mark and Samuel, Didier and Thomas, Gilles and
              Zucman-Rossi, Jessica and Lichter, Peter and Eils, Roland and
              Brors, Benedikt and Korbel, Jan O and Korshunov, Andrey and
              Landgraf, Pablo and Lehrach, Hans and Pfister, Stefan and
              Radlwimmer, Bernhard and Reifenberger, Guido and Taylor, Michael
              D and von Kalle, Christof and Majumder, Partha P and Sarin, Rajiv
              and Rao, T S and Bhan, M K and Scarpa, Aldo and Pederzoli, Paolo
              and Lawlor, Rita A and Delledonne, Massimo and Bardelli, Alberto
              and Biankin, Andrew V and Grimmond, Sean M and Gress, Thomas and
              Klimstra, David and Zamboni, Giuseppe and Shibata, Tatsuhiro and
              Nakamura, Yusuke and Nakagawa, Hidewaki and Kusada, Jun and
              Tsunoda, Tatsuhiko and Miyano, Satoru and Aburatani, Hiroyuki and
              Kato, Kazuto and Fujimoto, Akihiro and Yoshida, Teruhiko and
              Campo, Elias and L{\'o}pez-Ot{\'\i}n, Carlos and Estivill, Xavier
              and Guig{\'o}, Roderic and de Sanjos{\'e}, Silvia and Piris,
              Miguel A and Montserrat, Emili and Gonz{\'a}lez-D{\'\i}az, Marcos
              and Puente, Xose S and Jares, Pedro and Valencia, Alfonso and
              Himmelbauer, Heinz and Himmelbaue, Heinz and Quesada, Victor and
              Bea, Silvia and Stratton, Michael R and Futreal, P Andrew and
              Campbell, Peter J and Vincent-Salomon, Anne and Richardson,
              Andrea L and Reis-Filho, Jorge S and van de Vijver, Marc and
              Thomas, Gilles and Masson-Jacquemier, Jocelyne D and Aparicio,
              Samuel and Borg, Ake and B{\o}rresen-Dale, Anne-Lise and Caldas,
              Carlos and Foekens, John A and Stunnenberg, Hendrik G and van't
              Veer, Laura and Easton, Douglas F and Spellman, Paul T and
              Martin, Sancha and Barker, Anna D and Chin, Lynda and Collins,
              Francis S and Compton, Carolyn C and Ferguson, Martin L and
              Gerhard, Daniela S and Getz, Gad and Gunter, Chris and
              Guttmacher, Alan and Guyer, Mark and Hayes, D Neil and Lander,
              Eric S and Ozenberger, Brad and Penny, Robert and Peterson, Jane
              and Sander, Chris and Shaw, Kenna M and Speed, Terence P and
              Spellman, Paul T and Vockley, Joseph G and Wheeler, David A and
              Wilson, Richard K and Hudson, Thomas J and Chin, Lynda and
              Knoppers, Bartha M and Lander, Eric S and Lichter, Peter and
              Stein, Lincoln D and Stratton, Michael R and Anderson, Warwick
              and Barker, Anna D and Bell, Cindy and Bobrow, Martin and Burke,
              Wylie and Collins, Francis S and Compton, Carolyn C and DePinho,
              Ronald A and Easton, Douglas F and Futreal, P Andrew and Gerhard,
              Daniela S and Green, Anthony R and Guyer, Mark and Hamilton,
              Stanley R and Hubbard, Tim J and Kallioniemi, Olli P and Kennedy,
              Karen L and Ley, Timothy J and Liu, Edison T and Lu, Youyong and
              Majumder, Partha and Marra, Marco and Ozenberger, Brad and
              Peterson, Jane and Schafer, Alan J and Spellman, Paul T and
              Stunnenberg, Hendrik G and Wainwright, Brandon J and Wilson,
              Richard K and Yang, Huanming",
  abstract = "The International Cancer Genome Consortium (ICGC) was launched to
              coordinate large-scale cancer genome studies in tumours from 50
              different cancer types and/or subtypes that are of clinical and
              societal importance across the globe. Systematic studies of more
              than 25,000 cancer genomes at the genomic, epigenomic and
              transcriptomic levels will reveal the repertoire of oncogenic
              mutations, uncover traces of the mutagenic influences, define
              clinically relevant subtypes for prognosis and therapeutic
              management, and enable the development of new cancer therapies.",
  journal  = "Nature",
  volume   =  464,
  number   =  7291,
  pages    = "993--998",
  month    =  apr,
  year     =  2010,
  language = "en"
}

% Tsuji and Weng (2016), Brief. Bioinform. 17(6): evaluation of preprocessing,
% mapping and postprocessing algorithms for WGBS data.
@article{Tsuji2016-bj,
  author   = {Tsuji, Junko and Weng, Zhiping},
  title    = {Evaluation of preprocessing, mapping and postprocessing
    algorithms for analyzing whole genome bisulfite sequencing data},
  journal  = {Brief. Bioinform.},
  volume   = {17},
  number   = {6},
  pages    = {938--952},
  month    = nov,
  year     = {2016},
  keywords = {DNA methylation; WGBS analysis step evaluation; WGBS mapping
    software; read quality trimming; whole genome bisulfite
    sequencing},
  language = {en},
  abstract = {Cytosine methylation regulates many biological processes such as
    gene expression, chromatin structure and chromosome stability.
    The whole genome bisulfite sequencing (WGBS) technique measures
    the methylation level at each cytosine throughout the genome.
    There are an increasing number of publicly available pipelines
    for analyzing WGBS data, reflecting many choices of read mapping
    algorithms as well as preprocessing and postprocessing methods.
    We simulated single-end and paired-end reads based on three
    experimental data sets, and comprehensively evaluated 192
    combinations of three preprocessing, five postprocessing and five
    widely used read mapping algorithms. We also compared paired-end
    data with single-end data at the same sequencing depth for
    performance of read mapping and methylation level estimation.
    Bismark and LAST were the most robust mapping algorithms. We
    found that Mott trimming and quality filtering individually
    improved the performance of both read mapping and methylation
    level estimation, but combining them did not lead to further
    improvement. Furthermore, we confirmed that paired-end sequencing
    reduced error rate and enhanced sensitivity for both read mapping
    and methylation level estimation, especially for short reads and
    in repetitive regions of the human genome.}
}

% Grunau, Clark and Rosenthal (2001), Nucleic Acids Res. 29(13).
% pages holds the electronic article number (e65), consistent with the other
% Nucleic Acids Res. entries in this file; ">/=" in the exported abstract was a
% garbled greater-than-or-equal sign and is written as $\geq$.
@ARTICLE{Grunau2001-gd,
  title    = "Bisulfite genomic sequencing: systematic investigation of
              critical experimental parameters",
  author   = "Grunau, C and Clark, S J and Rosenthal, A",
  abstract = "Bisulfite genomic sequencing is the method of choice for the
              generation of methylation maps with single-base resolution. The
              method is based on the selective deamination of cytosine to
              uracil by treatment with bisulfite and the sequencing of
              subsequently generated PCR products. In contrast to cytosine,
              5-methylcytosine does not react with bisulfite and can therefore
              be distinguished. In order to investigate the potential for
              optimization of the method and to determine the critical
              experimental parameters, we determined the influence of
              incubation time and incubation temperature on the deamination
              efficiency and measured the degree of DNA degradation during the
              bisulfite treatment. We found that maximum conversion rates of
              cytosine occurred at 55 degrees C (4-18 h) and 95 degrees C (1
              h). Under these conditions at least 84-96\% of the DNA is
              degraded. To study the impact of primer selection, homologous DNA
              templates were constructed possessing cytosine-containing and
              cytosine-free primer binding sites, respectively. The recognition
              rates for cytosine ($\geq$97\%) and 5-methylcytosine ($\geq$94\%)
              were found to be identical for both templates.",
  journal  = "Nucleic Acids Res.",
  volume   =  29,
  number   =  13,
  pages    = "e65",
  month    =  jul,
  year     =  2001,
  language = "en"
}

% Meissner et al. (2005), Nucleic Acids Res. 33(18): reduced representation
% bisulfite sequencing (RRBS).
@article{Meissner2005-ig,
  author   = {Meissner, Alexander and Gnirke, Andreas and Bell, George W and
    Ramsahoye, Bernard and Lander, Eric S and Jaenisch, Rudolf},
  title    = {Reduced representation bisulfite sequencing for comparative
    high-resolution {DNA} methylation analysis},
  journal  = {Nucleic Acids Res.},
  volume   = {33},
  number   = {18},
  pages    = {5868--5877},
  month    = oct,
  year     = {2005},
  language = {en},
  abstract = {We describe a large-scale random approach termed reduced
    representation bisulfite sequencing (RRBS) for analyzing and
    comparing genomic methylation patterns. BglII restriction
    fragments were size-selected to 500-600 bp, equipped with
    adapters, treated with bisulfite, PCR amplified, cloned and
    sequenced. We constructed RRBS libraries from murine ES cells and
    from ES cells lacking DNA methyltransferases Dnmt3a and 3b and
    with knocked-down (kd) levels of Dnmt1 (Dnmt[1(kd),3a-/-,3b-/-]).
    Sequencing of 960 RRBS clones from Dnmt[1(kd),3a-/-,3b-/-] cells
    generated 343 kb of non-redundant bisulfite sequence covering
    66212 cytosines in the genome. All but 38 cytosines had been
    converted to uracil indicating a conversion rate of >99.9\%. Of
    the remaining cytosines 35 were found in CpG and 3 in CpT
    dinucleotides. Non-CpG methylation was >250-fold reduced compared
    with wild-type ES cells, consistent with a role for Dnmt3a and/or
    Dnmt3b in CpA and CpT methylation. Closer inspection revealed
    neither a consensus sequence around the methylated sites nor
    evidence for clustering of residual methylation in the genome.
    Our findings indicate random loss rather than specific
    maintenance of methylation in Dnmt[1(kd),3a-/-,3b-/-] cells.
    Near-complete bisulfite conversion and largely unbiased
    representation of RRBS libraries suggest that random shotgun
    bisulfite sequencing can be scaled to a genome-wide approach.}
}

% Whittlesea (1993), J. Exp. Psychol. Learn. Mem. Cogn. 19(6).
% pages gives the full article range; the export carried only the first page.
@ARTICLE{Whittlesea1993-ns,
  title     = "Illusions of familiarity",
  author    = "Whittlesea, Bruce W A",
  abstract  = "Feelings of familiarity are not direct products of memory.
               Although prior experience of a stimulus can produce a feeling of
               familiarity, that feeling can also be aroused in the absence of
               prior experience if perceptual processing of the stimulus is
               fluent (e.g., B. W. Whittlesea et al, 1990). This suggests that
               feelings of familiarity arise through an unconscious inference
               about the source of processing fluency. The present experiments
               extend that conclusion. First, they show that a wide variety of
               feelings about the past are controlled by a fluency heuristic,
               including feelings about the meaning, pleasantness, duration,
               and recency of past events. Second, they demonstrate that the
               attribution process does not rely only on perceptual fluency,
               but can be influenced even more by the fluency of conceptual
               processing. Third, they show that although the fluency heuristic
               itself is simple, people's use of it is highly sophisticated and
               makes them robustly sensitive to the actual historical status of
               current events. (PsycINFO Database Record (c) 2016 APA, all
               rights reserved)",
  journal   = "J. Exp. Psychol. Learn. Mem. Cogn.",
  publisher = "American Psychological Association",
  volume    =  19,
  number    =  6,
  pages     = "1235--1253",
  month     =  nov,
  year      =  1993,
  keywords  = "fluency heuristic, feelings \& illusions of familiarity, college
               students",
  language  = "en"
}

% Fernandez et al. (2012), Genome Res. 22(2): DNA methylation fingerprint of
% 1628 human samples.
@article{Fernandez2012-mg,
  author   = {Fernandez, Agustin F and Assenov, Yassen and Martin-Subero, Jose
    Ignacio and Balint, Balazs and Siebert, Reiner and Taniguchi,
    Hiroaki and Yamamoto, Hiroyuki and Hidalgo, Manuel and Tan,
    Aik-Choon and Galm, Oliver and Ferrer, Isidre and
    Sanchez-Cespedes, Montse and Villanueva, Alberto and Carmona,
    Javier and Sanchez-Mut, Jose V and Berdasco, Maria and Moreno,
    Victor and Capella, Gabriel and Monk, David and Ballestar,
    Esteban and Ropero, Santiago and Martinez, Ramon and
    Sanchez-Carbayo, Marta and Prosper, Felipe and Agirre, Xabier and
    Fraga, Mario F and Gra{\~n}a, Osvaldo and Perez-Jurado, Luis and
    Mora, Jaume and Puig, Susana and Prat, Jaime and Badimon, Lina
    and Puca, Annibale A and Meltzer, Stephen J and Lengauer, Thomas
    and Bridgewater, John and Bock, Christoph and Esteller, Manel},
  title    = {A {DNA} methylation fingerprint of 1628 human samples},
  journal  = {Genome Res.},
  volume   = {22},
  number   = {2},
  pages    = {407--419},
  month    = feb,
  year     = {2012},
  language = {en},
  abstract = {Most of the studies characterizing DNA methylation patterns have
    been restricted to particular genomic loci in a limited number of
    human samples and pathological conditions. Herein, we present a
    compromise between an extremely comprehensive study of a human
    sample population with an intermediate level of resolution of
    CpGs at the genomic level. We obtained a DNA methylation
    fingerprint of 1628 human samples in which we interrogated 1505
    CpG sites. The DNA methylation patterns revealed show this
    epigenetic mark to be critical in tissue-type definition and
    stemness, particularly around transcription start sites that are
    not within a CpG island. For disease, the generated DNA
    methylation fingerprints show that, during tumorigenesis, human
    cancer cells underwent a progressive gain of promoter CpG-island
    hypermethylation and a loss of CpG methylation in non-CpG-island
    promoters. Although transformed cells are those in which DNA
    methylation disruption is more obvious, we observed that other
    common human diseases, such as neurological and autoimmune
    disorders, had their own distinct DNA methylation profiles. Most
    importantly, we provide proof of principle that the DNA
    methylation fingerprints obtained might be useful for
    translational purposes by showing that we are able to identify
    the tumor type origin of cancers of unknown primary origin
    (CUPs). Thus, the DNA methylation patterns identified across the
    largest spectrum of samples, tissues, and diseases reported to
    date constitute a baseline for developing higher-resolution DNA
    methylation maps and provide important clues concerning the
    contribution of CpG methylation to tissue identity and its
    changes in the most prevalent human diseases.}
}

% Ball et al. (2009), Nat. Biotechnol. 27(4): targeted and genome-scale
% methylation profiling; gene-body methylation signatures.
@article{Ball2009-at,
  author   = {Ball, Madeleine P and Li, Jin Billy and Gao, Yuan and Lee,
    Je-Hyuk and LeProust, Emily M and Park, In-Hyun and Xie, Bin and
    Daley, George Q and Church, George M},
  title    = {Targeted and genome-scale strategies reveal gene-body methylation
    signatures in human cells},
  journal  = {Nat. Biotechnol.},
  volume   = {27},
  number   = {4},
  pages    = {361--368},
  month    = apr,
  year     = {2009},
  language = {en},
  abstract = {Studies of epigenetic modifications would benefit from improved
    methods for high-throughput methylation profiling. We introduce
    two complementary approaches that use next-generation sequencing
    technology to detect cytosine methylation. In the first method,
    we designed approximately 10,000 bisulfite padlock probes to
    profile approximately 7,000 CpG locations distributed over the
    ENCODE pilot project regions and applied them to human
    B-lymphocytes, fibroblasts and induced pluripotent stem cells.
    This unbiased choice of targets takes advantage of existing
    expression and chromatin immunoprecipitation data and enabled us
    to observe a pattern of low promoter methylation and high
    gene-body methylation in highly expressed genes. The second
    method, methyl-sensitive cut counting, generated nontargeted
    genome-scale data for approximately 1.4 million HpaII sites in
    the DNA of B-lymphocytes and confirmed that gene-body methylation
    in highly expressed genes is a consistent phenomenon throughout
    the human genome. Our observations highlight the usefulness of
    techniques that are not inherently or intentionally biased
    towards particular subsets like CpG islands or promoter regions.}
}

% Clark et al. (2011), Genome Integr. 2: direct detection and sequencing of
% damaged DNA bases.
@article{Clark2011-sc,
  author   = {Clark, Tyson A and Spittle, Kristi E and Turner, Stephen W and
    Korlach, Jonas},
  title    = {Direct detection and sequencing of damaged {DNA} bases},
  journal  = {Genome Integr.},
  volume   = {2},
  pages    = {10},
  month    = dec,
  year     = {2011},
  language = {en},
  abstract = {Products of various forms of DNA damage have been implicated in a
    variety of important biological processes, such as aging,
    neurodegenerative diseases, and cancer. Therefore, there exists
    great interest to develop methods for interrogating damaged DNA
    in the context of sequencing. Here, we demonstrate that
    single-molecule, real-time (SMRT\textregistered{}) DNA sequencing
    can directly detect damaged DNA bases in the DNA template - as a
    by-product of the sequencing method - through an analysis of the
    DNA polymerase kinetics that are altered by the presence of a
    modified base. We demonstrate the sequencing of several DNA
    templates containing products of DNA damage, including
    8-oxoguanine, 8-oxoadenine, O6-methylguanine, 1-methyladenine,
    O4-methylthymine, 5-hydroxycytosine, 5-hydroxyuracil,
    5-hydroxymethyluracil, or thymine dimers, and show that these
    base modifications can be readily detected with
    single-modification resolution and DNA strand specificity. We
    characterize the distinct kinetic signatures generated by these
    DNA base modifications.}
}

% Tran et al. (2014), Adv. Bioinformatics 2014: evaluation of bisulfite short
% read mapping tools.
@article{Tran2014-mc,
  author   = {Tran, Hong and Porter, Jacob and Sun, Ming-An and Xie, Hehuang
    and Zhang, Liqing},
  title    = {Objective and comprehensive evaluation of bisulfite short read
    mapping tools},
  journal  = {Adv. Bioinformatics},
  volume   = {2014},
  pages    = {472045},
  month    = apr,
  year     = {2014},
  language = {en},
  abstract = {Background. Large-scale bisulfite treatment and short reads
    sequencing technology allow comprehensive estimation of
    methylation states of Cs in the genomes of different tissues,
    cell types, and developmental stages. Accurate characterization
    of DNA methylation is essential for understanding genotype
    phenotype association, gene and environment interaction,
    diseases, and cancer. Aligning bisulfite short reads to a
    reference genome has been a challenging task. We compared five
    bisulfite short read mapping tools, BSMAP, Bismark, BS-Seeker,
    BiSS, and BRAT-BW, representing two classes of mapping algorithms
    (hash table and suffix/prefix tries). We examined their mapping
    efficiency (i.e., the percentage of reads that can be mapped to
    the genomes), usability, running time, and effects of changing
    default parameter settings using both real and simulated reads.
    We also investigated how preprocessing data might affect mapping
    efficiency. Conclusion. Among the five programs compared, in
    terms of mapping efficiency, Bismark performs the best on the
    real data, followed by BiSS, BSMAP, and finally BRAT-BW and
    BS-Seeker with very similar performance. If CPU time is not a
    constraint, Bismark is a good choice of program for mapping
    bisulfite treated short reads. Data quality impacts a great deal
    mapping efficiency. Although increasing the number of mismatches
    allowed can increase mapping efficiency, it not only
    significantly slows down the program, but also runs the risk of
    having increased false positives. Therefore, users should
    carefully set the related parameters depending on the quality of
    their sequencing data.}
}

% Yang et al. (2015), Hum. Mol. Genet. 24(15): DNA methylation analysis of
% multiple cell lines.
@article{Yang2015-dh,
  author  = {Yang, Xiaofei and Shao, Xiaojian and Gao, Lin and Zhang, Shihua},
  title   = {Systematic {DNA} methylation analysis of multiple cell lines
    reveals common and specific patterns within and across tissues of
    origin},
  journal = {Hum. Mol. Genet.},
  volume  = {24},
  number  = {15},
  pages   = {4374--4384},
  year    = {2015}
}

% Gaidatzis et al. (2014), PLoS Genet. 10(2).
% Title is kept in sentence case like the other entries ("mammalian" lowercased);
% "R(2)" in the exported abstract was a flattened R-squared and is written as
% $R^2$.
@ARTICLE{Gaidatzis2014-st,
  title    = "{DNA} sequence explains seemingly disordered methylation levels
              in partially methylated domains of mammalian genomes",
  author   = "Gaidatzis, Dimos and Burger, Lukas and Murr, Rabih and Lerch,
              Anita and Dessus-Babus, Sophie and Sch{\"u}beler, Dirk and
              Stadler, Michael B",
  abstract = "For the most part metazoan genomes are highly methylated and
              harbor only small regions with low or absent methylation. In
              contrast, partially methylated domains (PMDs), recently
              discovered in a variety of cell lines and tissues, do not fit
              this paradigm as they show partial methylation for large portions
              (20\%-40\%) of the genome. While in PMDs methylation levels are
              reduced on average, we found that at single CpG resolution, they
              show extensive variability along the genome outside of CpG
              islands and DNase I hypersensitive sites (DHS). Methylation
              levels range from 0\% to 100\% in a roughly uniform fashion with
              only little similarity between neighboring CpGs. A comparison of
              various PMD-containing methylomes showed that these seemingly
              disordered states of methylation are strongly conserved across
              cell types for virtually every PMD. Comparative sequence analysis
              suggests that DNA sequence is a major determinant of these
              methylation states. This is further substantiated by a purely
              sequence based model which can predict 31\% ($R^2$) of the
              variation in methylation. The model revealed CpG density as the
              main driving feature promoting methylation, opposite to what has
              been shown for CpG islands, followed by various dinucleotides
              immediately flanking the CpG and a minor contribution from
              sequence preferences reflecting nucleosome positioning. Taken
              together we provide a reinterpretation for the
              nucleotide-specific methylation levels observed in PMDs,
              demonstrate their conservation across tissues and suggest that
              they are mainly determined by specific DNA sequence features.",
  journal  = "PLoS Genet.",
  volume   =  10,
  number   =  2,
  pages    = "e1004143",
  month    =  feb,
  year     =  2014,
  language = "en"
}

% Storey and Tibshirani (2003), Proc. Natl. Acad. Sci. U. S. A. 100(16):
% q values and the false discovery rate for genomewide studies.
@article{Storey2003-nv,
  author   = {Storey, John D and Tibshirani, Robert},
  title    = {Statistical significance for genomewide studies},
  journal  = {Proc. Natl. Acad. Sci. U. S. A.},
  volume   = {100},
  number   = {16},
  pages    = {9440--9445},
  month    = aug,
  year     = {2003},
  language = {en},
  abstract = {With the increase in genomewide experiments and the sequencing of
    multiple genomes, the analysis of large data sets has become
    commonplace in biology. It is often the case that thousands of
    features in a genomewide data set are tested against some null
    hypothesis, where a number of features are expected to be
    significant. Here we propose an approach to measuring statistical
    significance in these genomewide studies based on the concept of
    the false discovery rate. This approach offers a sensible balance
    between the number of true and false positives that is
    automatically calibrated and easily interpreted. In doing so, a
    measure of statistical significance called the q value is
    associated with each tested feature. The q value is similar to
    the well known p value, except it is a measure of significance in
    terms of the false discovery rate rather than the false positive
    rate. Our approach avoids a flood of false positive results,
    while offering a more liberal criterion than what has been used
    in genome scans for linkage.}
}

% Roadmap Epigenomics Consortium et al. (2015), Nature 518(7539): integrative
% analysis of 111 reference human epigenomes. The consortium name is
% double-braced so it is treated as a single corporate author.
@article{Roadmap_Epigenomics_Consortium2015-cj,
  author   = {{Roadmap Epigenomics Consortium} and Kundaje, Anshul and
    Meuleman, Wouter and Ernst, Jason and Bilenky, Misha and Yen,
    Angela and Heravi-Moussavi, Alireza and Kheradpour, Pouya and
    Zhang, Zhizhuo and Wang, Jianrong and Ziller, Michael J and Amin,
    Viren and Whitaker, John W and Schultz, Matthew D and Ward, Lucas
    D and Sarkar, Abhishek and Quon, Gerald and Sandstrom, Richard S
    and Eaton, Matthew L and Wu, Yi-Chieh and Pfenning, Andreas R and
    Wang, Xinchen and Claussnitzer, Melina and Liu, Yaping and
    Coarfa, Cristian and Harris, R Alan and Shoresh, Noam and
    Epstein, Charles B and Gjoneska, Elizabeta and Leung, Danny and
    Xie, Wei and Hawkins, R David and Lister, Ryan and Hong, Chibo
    and Gascard, Philippe and Mungall, Andrew J and Moore, Richard
    and Chuah, Eric and Tam, Angela and Canfield, Theresa K and
    Hansen, R Scott and Kaul, Rajinder and Sabo, Peter J and Bansal,
    Mukul S and Carles, Annaick and Dixon, Jesse R and Farh, Kai-How
    and Feizi, Soheil and Karlic, Rosa and Kim, Ah-Ram and Kulkarni,
    Ashwinikumar and Li, Daofeng and Lowdon, Rebecca and Elliott,
    Ginell and Mercer, Tim R and Neph, Shane J and Onuchic, Vitor and
    Polak, Paz and Rajagopal, Nisha and Ray, Pradipta and Sallari,
    Richard C and Siebenthall, Kyle T and Sinnott-Armstrong, Nicholas
    A and Stevens, Michael and Thurman, Robert E and Wu, Jie and
    Zhang, Bo and Zhou, Xin and Beaudet, Arthur E and Boyer, Laurie A
    and De Jager, Philip L and Farnham, Peggy J and Fisher, Susan J
    and Haussler, David and Jones, Steven J M and Li, Wei and Marra,
    Marco A and McManus, Michael T and Sunyaev, Shamil and Thomson,
    James A and Tlsty, Thea D and Tsai, Li-Huei and Wang, Wei and
    Waterland, Robert A and Zhang, Michael Q and Chadwick, Lisa H and
    Bernstein, Bradley E and Costello, Joseph F and Ecker, Joseph R
    and Hirst, Martin and Meissner, Alexander and Milosavljevic,
    Aleksandar and Ren, Bing and Stamatoyannopoulos, John A and Wang,
    Ting and Kellis, Manolis},
  title    = {Integrative analysis of 111 reference human epigenomes},
  journal  = {Nature},
  volume   = {518},
  number   = {7539},
  pages    = {317--330},
  month    = feb,
  year     = {2015},
  language = {en},
  abstract = {The reference human genome sequence set the stage for studies of
    genetic variation and its association with human disease, but
    epigenomic studies lack a similar reference. To address this
    need, the NIH Roadmap Epigenomics Consortium generated the
    largest collection so far of human epigenomes for primary cells
    and tissues. Here we describe the integrative analysis of 111
    reference human epigenomes generated as part of the programme,
    profiled for histone modification patterns, DNA accessibility,
    DNA methylation and RNA expression. We establish global maps of
    regulatory elements, define regulatory modules of coordinated
    activity, and their likely activators and repressors. We show
    that disease- and trait-associated genetic variants are enriched
    in tissue-specific epigenomic marks, revealing biologically
    relevant cell types for diverse human traits, and providing a
    resource for interpreting the molecular basis of human disease.
    Our results demonstrate the central role of epigenomic
    information for understanding gene regulation, cellular
    differentiation and human disease.}
}

% Akalin et al. (2012), Genome Biol. 13(10).
% The package name is braced in the title so that sentence-casing bibliography
% styles keep its camelCase spelling instead of lowercasing the inner "K".
@ARTICLE{Akalin2012-af,
  title    = "{methylKit}: a comprehensive {R} package for the analysis of
              genome-wide {DNA} methylation profiles",
  author   = "Akalin, Altuna and Kormaksson, Matthias and Li, Sheng and
              Garrett-Bakelman, Francine E and Figueroa, Maria E and Melnick,
              Ari and Mason, Christopher E",
  abstract = "DNA methylation is a chemical modification of cytosine bases that
              is pivotal for gene regulation, cellular specification and cancer
              development. Here, we describe an R package, methylKit, that
              rapidly analyzes genome-wide cytosine epigenetic profiles from
              high-throughput methylation and hydroxymethylation sequencing
              experiments. methylKit includes functions for clustering, sample
              quality visualization, differential methylation analysis and
              annotation features, thus automating and simplifying many of the
              steps for discerning statistically significant bases or regions
              of DNA methylation. Finally, we demonstrate methylKit on breast
              cancer data, in which we find statistically significant regions
              of differential methylation and stratify tumor subtypes.
              methylKit is available at http://code.google.com/p/methylkit.",
  journal  = "Genome Biol.",
  volume   =  13,
  number   =  10,
  pages    = "R87",
  month    =  oct,
  year     =  2012,
  language = "en"
}

% Li et al. (2015), Nucleic Acids Res. 43(12): post-conversion targeted capture
% of modified cytosines.
@article{Li2015-mz,
  author   = {Li, Qing and Suzuki, Masako and Wendt, Jennifer and Patterson,
    Nicole and Eichten, Steven R and Hermanson, Peter J and Green,
    Dawn and Jeddeloh, Jeffrey and Richmond, Todd and Rosenbaum,
    Heidi and Burgess, Daniel and Springer, Nathan M and Greally,
    John M},
  title    = {Post-conversion targeted capture of modified cytosines in
    mammalian and plant genomes},
  journal  = {Nucleic Acids Res.},
  volume   = {43},
  number   = {12},
  pages    = {e81},
  month    = jul,
  year     = {2015},
  language = {en},
  abstract = {We present a capture-based approach for bisulfite-converted DNA
    that allows interrogation of pre-defined genomic locations,
    allowing quantitative and qualitative assessments of
    5-methylcytosine (5mC) and 5-hydroxymethylcytosine (5hmC) at CG
    dinucleotides and in non-CG contexts (CHG, CHH) in mammalian and
    plant genomes. We show the technique works robustly and
    reproducibly using as little as 500 ng of starting DNA, with
    results correlating well with whole genome bisulfite sequencing
    data, and demonstrate that human DNA can be tested in samples
    contaminated with microbial DNA. This targeting approach will
    allow cell type-specific designs to maximize the value of 5mC and
    5hmC sequencing.}
}

% Wu et al. (2015), Nucleic Acids Res. 43(21): DSS-single, DMR detection from
% WGBS data without replicates.
% Abstract: restored the missing space between sentences ("limited. We").
@ARTICLE{Wu2015-wt,
  title    = "Detection of differentially methylated regions from whole-genome
              bisulfite sequencing data without replicates",
  author   = "Wu, Hao and Xu, Tianlei and Feng, Hao and Chen, Li and Li, Ben
              and Yao, Bing and Qin, Zhaohui and Jin, Peng and Conneely, Karen
              N",
  abstract = "DNA methylation is an important epigenetic modification involved
              in many biological processes and diseases. Recent developments in
              whole genome bisulfite sequencing (WGBS) technology have enabled
              genome-wide measurements of DNA methylation at single base pair
              resolution. Many experiments have been conducted to compare DNA
              methylation profiles under different biological contexts, with
              the goal of identifying differentially methylated regions (DMRs).
              Due to the high cost of WGBS experiments, many studies are still
              conducted without biological replicates. Methods and tools
              available for analyzing such data are very limited. We develop a
              statistical method, DSS-single, for detecting DMRs from WGBS data
              without replicates. We characterize the count data using a
              rigorous model that accounts for the spatial correlation of
              methylation levels, sequence depth and biological variation. We
              demonstrate that using information from neighboring CG sites,
              biological variation can be estimated accurately even without
              replicates. DMR detection is then carried out via a Wald test
              procedure. Simulations demonstrate that DSS-single has greater
              sensitivity and accuracy than existing methods, and an analysis
              of H1 versus IMR90 cell lines suggests that it also yields the
              most biologically meaningful results. DSS-single is implemented
              in the Bioconductor package DSS.",
  journal  = "Nucleic Acids Res.",
  volume   =  43,
  number   =  21,
  pages    = "e141",
  month    =  dec,
  year     =  2015,
  language = "en"
}

% ENCODE Project Consortium (2012), Nature 489(7414): integrated encyclopedia
% of DNA elements in the human genome. The author is double-braced so it is
% treated as a single corporate name.
@article{ENCODE_Project_Consortium2012-wf,
  author   = {{ENCODE Project Consortium}},
  title    = {An integrated encyclopedia of {DNA} elements in the human genome},
  journal  = {Nature},
  volume   = {489},
  number   = {7414},
  pages    = {57--74},
  month    = sep,
  year     = {2012},
  language = {en},
  abstract = {The human genome encodes the blueprint of life, but the function
    of the vast majority of its nearly three billion bases is
    unknown. The Encyclopedia of DNA Elements (ENCODE) project has
    systematically mapped regions of transcription, transcription
    factor association, chromatin structure and histone modification.
    These data enabled us to assign biochemical functions for 80\% of
    the genome, in particular outside of the well-studied
    protein-coding regions. Many discovered candidate regulatory
    elements are physically associated with one another and with
    expressed genes, providing new insights into the mechanisms of
    gene regulation. The newly identified elements also show a
    statistical correspondence to sequence variants linked to human
    disease, and can thereby guide interpretation of this variation.
    Overall, the project provides new insights into the organization
    and regulation of our genes and genome, and is an expansive
    resource of functional annotations for biomedical research.}
}

@article{Dolzhenko2014-mk,
  author   = {Dolzhenko, Egor and Smith, Andrew D},
  title    = {Using beta-binomial regression for high-precision differential
              methylation analysis in multifactor whole-genome bisulfite
              sequencing experiments},
  journal  = {BMC Bioinformatics},
  volume   = {15},
  pages    = {215},
  month    = jun,
  year     = {2014},
  language = {en},
  abstract = {BACKGROUND: Whole-genome bisulfite sequencing currently provides
              the highest-precision view of the epigenome, with quantitative
              information about populations of cells down to single nucleotide
              resolution. Several studies have demonstrated the value of this
              precision: meaningful features that correlate strongly with
              biological functions can be found associated with only a few CpG
              sites. Understanding the role of DNA methylation, and more
              broadly the role of DNA accessibility, requires that methylation
              differences between populations of cells are identified with
              extreme precision and in complex experimental designs. RESULTS:
              In this work we investigated the use of beta-binomial regression
              as a general approach for modeling whole-genome bisulfite data to
              identify differentially methylated sites and genomic intervals.
              CONCLUSIONS: The regression-based analysis can handle medium- and
              large-scale experiments where it becomes critical to accurately
              model variation in methylation levels between replicates and
              account for influence of various experimental factors like cell
              types or batch effects.},
}

@article{Xi2009-wf,
  author   = {Xi, Yuanxin and Li, Wei},
  title    = {{BSMAP}: whole genome bisulfite sequence {MAPping} program},
  journal  = {BMC Bioinformatics},
  volume   = {10},
  pages    = {232},
  month    = jul,
  year     = {2009},
  language = {en},
  abstract = {BACKGROUND: Bisulfite sequencing is a powerful technique to study
              DNA cytosine methylation. Bisulfite treatment followed by PCR
              amplification specifically converts unmethylated cytosines to
              thymine. Coupled with next generation sequencing technology, it
              is able to detect the methylation status of every cytosine in the
              genome. However, mapping high-throughput bisulfite reads to the
              reference genome remains a great challenge due to the increased
              searching space, reduced complexity of bisulfite sequence,
              asymmetric cytosine to thymine alignments, and multiple CpG
              heterogeneous methylation. RESULTS: We developed an efficient
              bisulfite reads mapping algorithm BSMAP to address the above
              issues. BSMAP combines genome hashing and bitwise masking to
              achieve fast and accurate bisulfite mapping. Compared with
              existing bisulfite mapping approaches, BSMAP is faster, more
              sensitive and more flexible. CONCLUSION: BSMAP is the first
              general-purpose bisulfite mapping software. It is able to map
              high-throughput bisulfite reads at whole genome level with
              feasible memory and CPU usage. It is freely available under GPL
              v3 license at http://code.google.com/p/bsmap/.},
}

% Cites the review article itself (issue 11, pp. 661--678); the record at
% 17(12):772 is only the published erratum to it.
@ARTICLE{Bonev2016-sk,
  title    = "Organization and function of the {3D} genome",
  author   = "Bonev, Boyan and Cavalli, Giacomo",
  journal  = "Nat. Rev. Genet.",
  volume   =  17,
  number   =  11,
  pages    = "661--678",
  month    =  nov,
  year     =  2016,
  language = "en"
}

@article{Song2013-an,
  author   = {Song, Chun-Xiao and Szulwach, Keith E and Dai, Qing and Fu, Ye
              and Mao, Shi-Qing and Lin, Li and Street, Craig and Li, Yujing
              and Poidevin, Mickael and Wu, Hao and Gao, Juan and Liu, Peng and
              Li, Lin and Xu, Guo-Liang and Jin, Peng and He, Chuan},
  title    = {Genome-wide profiling of 5-formylcytosine reveals its roles in
              epigenetic priming},
  journal  = {Cell},
  volume   = {153},
  number   = {3},
  pages    = {678--691},
  month    = apr,
  year     = {2013},
  language = {en},
  abstract = {TET proteins oxidize 5-methylcytosine (5mC) to
              5-hydroxymethylcytosine (5hmC), 5-formylcytosine (5fC), and
              5-carboxylcytosine (5caC). 5fC and 5caC are excised by mammalian
              DNA glycosylase TDG, implicating 5mC oxidation in DNA
              demethylation. Here, we show that the genomic locations of 5fC
              can be determined by coupling chemical reduction with biotin
              tagging. Genome-wide mapping of 5fC in mouse embryonic stem cells
              (mESCs) reveals that 5fC preferentially occurs at poised
              enhancers among other gene regulatory elements. Application to
              Tdg null mESCs further suggests that 5fC production coordinates
              with p300 in remodeling epigenetic states of enhancers. This
              process, which is not influenced by 5hmC, appears to be
              associated with further oxidation of 5hmC and commitment to
              demethylation through 5fC. Finally, we resolved 5fC at base
              resolution by hydroxylamine-based protection from
              bisulfite-mediated deamination, thereby confirming sites of 5fC
              accumulation. Our results reveal roles of active 5mC/5hmC
              oxidation and TDG-mediated demethylation in epigenetic tuning at
              regulatory elements.},
}

@article{Ivanov2013-se,
  author   = {Ivanov, Maxim and Kals, Mart and Kacevska, Marina and Metspalu,
              Andres and Ingelman-Sundberg, Magnus and Milani, Lili},
  title    = {In-solution hybrid capture of bisulfite-converted {DNA} for
              targeted bisulfite sequencing of 174 {ADME} genes},
  journal  = {Nucleic Acids Res.},
  volume   = {41},
  number   = {6},
  pages    = {e72},
  month    = apr,
  year     = {2013},
  language = {en},
  abstract = {DNA methylation is one of the most important epigenetic
              alterations involved in the control of gene expression. Bisulfite
              sequencing of genomic DNA is currently the only method to study
              DNA methylation patterns at single-nucleotide resolution. Hence,
              next-generation sequencing of bisulfite-converted DNA is the
              method of choice to investigate DNA methylation profiles at the
              genome-wide scale. Nevertheless, whole genome sequencing for
              analysis of human methylomes is expensive, and a method for
              targeted gene analysis would provide a good alternative in many
              cases where the primary interest is restricted to a set of genes.
              Here, we report the successful use of a custom Agilent SureSelect
              Target Enrichment system for the hybrid capture of
              bisulfite-converted DNA. We prepared bisulfite-converted
              next-generation sequencing libraries, which are enriched for the
              coding and regulatory regions of 174 ADME genes (i.e. genes
              involved in the metabolism and distribution of drugs). Sequencing
              of these libraries on Illumina's HiSeq2000 revealed that the
              method allows a reliable quantification of methylation levels of
              CpG sites in the selected genes, and validation of the method
              using pyrosequencing and the Illumina 450K methylation BeadChips
              revealed good concordance.},
}

% Standard misc entries do not print a `journal` field, so the venue is
% carried in `howpublished`. No publication year is recorded for this item.
@MISC{Beck_undated-kw,
  title        = "Faculty of 1000 evaluation for Distinct {DNA} methylomes of
                  newborns and centenarians",
  author       = "Beck, Stephan",
  howpublished = "F1000 - Post-publication peer review of the biomedical literature"
}

@article{Pedersen2011-hq,
  author   = {Pedersen, Brent and Hsieh, Tzung-Fu and Ibarra, Christian and
              Fischer, Robert L},
  title    = {{MethylCoder}: software pipeline for bisulfite-treated sequences},
  journal  = {Bioinformatics},
  volume   = {27},
  number   = {17},
  pages    = {2435--2436},
  month    = sep,
  year     = {2011},
  language = {en},
  abstract = {MOTIVATION: MethylCoder is a software program that generates
              per-base methylation data given a set of bisulfite-treated reads.
              It provides the option to use either of two existing short-read
              aligners, each with different strengths. It accounts for
              soft-masked alignments and overlapping paired-end reads.
              MethylCoder outputs data in text and binary formats in addition
              to the final alignment in SAM format, so that common
              high-throughput sequencing tools can be used on the resulting
              output. It is more flexible than existing software and
              competitive in terms of speed and memory use. AVAILABILITY:
              MethylCoder requires only a python interpreter and a C compiler
              to run. Extensive documentation and the full source code are
              available under the MIT license at:
              https://github.com/brentp/methylcode. CONTACT:
              bpederse@gmail.com.},
}

@article{Huang2010-qi,
  author   = {Huang, Yun and Pastor, William A and Shen, Yinghua and Tahiliani,
              Mamta and Liu, David R and Rao, Anjana},
  title    = {The behaviour of 5-hydroxymethylcytosine in bisulfite sequencing},
  journal  = {PLoS One},
  volume   = {5},
  number   = {1},
  pages    = {e8888},
  month    = jan,
  year     = {2010},
  language = {en},
  abstract = {BACKGROUND: We recently showed that enzymes of the TET family
              convert 5-mC to 5-hydroxymethylcytosine (5-hmC) in DNA. 5-hmC is
              present at high levels in embryonic stem cells and Purkinje
              neurons. The methylation status of cytosines is typically
              assessed by reaction with sodium bisulfite followed by PCR
              amplification. Reaction with sodium bisulfite promotes cytosine
              deamination, whereas 5-methylcytosine (5-mC) reacts poorly with
              bisulfite and is resistant to deamination. Since 5-hmC reacts
              with bisulfite to yield cytosine 5-methylenesulfonate (CMS), we
              asked how DNA containing 5-hmC behaves in bisulfite sequencing.
              METHODOLOGY/PRINCIPAL FINDINGS: We used synthetic
              oligonucleotides with different distributions of cytosine as
              templates for generation of DNAs containing C, 5-mC and 5-hmC.
              The resulting DNAs were subjected in parallel to bisulfite
              treatment, followed by exposure to conditions promoting cytosine
              deamination. The extent of conversion of 5-hmC to CMS was
              estimated to be 99.7\%. Sequencing of PCR products showed that
              neither 5-mC nor 5-hmC undergo C-to-T transitions after bisulfite
              treatment, confirming that these two modified cytosine species
              are indistinguishable by the bisulfite technique. DNA in which
              CMS constituted a large fraction of all bases (28/201) was much
              less efficiently amplified than DNA in which those bases were
              5-mC or uracil (the latter produced by cytosine deamination).
              Using a series of primer extension experiments, we traced the
              inefficient amplification of CMS-containing DNA to stalling of
              Taq polymerase at sites of CMS modification, especially when two
              CMS bases were either adjacent to one another or separated by 1-2
              nucleotides. CONCLUSIONS: We have confirmed that the widely used
              bisulfite sequencing technique does not distinguish between 5-mC
              and 5-hmC. Moreover, we show that CMS, the product of bisulfite
              conversion of 5-hmC, tends to stall DNA polymerases during PCR,
              suggesting that densely hydroxymethylated regions of DNA may be
              underrepresented in quantitative methylation analyses.},
}

% Published in Supplement 5 of volume 14; the supplement goes in `number`
% so that `volume` stays a plain integer.
@ARTICLE{Li2013-zx,
  title    = "An optimized algorithm for detecting and annotating regional
              differential methylation",
  author   = "Li, Sheng and Garrett-Bakelman, Francine E and Akalin, Altuna and
              Zumbo, Paul and Levine, Ross and To, Bik L and Lewis, Ian D and
              Brown, Anna L and D'Andrea, Richard J and Melnick, Ari and Mason,
              Christopher E",
  abstract = "BACKGROUND: DNA methylation profiling reveals important
              differentially methylated regions (DMRs) of the genome that are
              altered during development or that are perturbed by disease. To
              date, few programs exist for regional analysis of enriched or
              whole-genome bisulfate conversion sequencing data, even though
              such data are increasingly common. Here, we describe an
              open-source, optimized method for determining empirically based
              DMRs (eDMR) from high-throughput sequence data that is applicable
              to enriched whole-genome methylation profiling datasets, as well
              as other globally enriched epigenetic modification data. RESULTS:
              Here we show that our bimodal distribution model and weighted
              cost function for optimized regional methylation analysis
              provides accurate boundaries of regions harboring significant
              epigenetic modifications. Our algorithm takes the spatial
              distribution of CpGs into account for the enrichment assay,
              allowing for optimization of the definition of empirical regions
              for differential methylation. Combined with the dependent
              adjustment for regional p-value combination and DMR annotation,
              we provide a method that may be applied to a variety of datasets
              for rapid DMR analysis. Our method classifies both the
              directionality of DMRs and their genome-wide distribution, and we
              have observed that shows clinical relevance through correct
              stratification of two Acute Myeloid Leukemia (AML) tumor
              sub-types. CONCLUSIONS: Our weighted optimization algorithm eDMR
              for calling DMRs extends an established DMR R pipeline
              (methylKit) and provides a needed resource in epigenomics. Our
              method enables an accurate and scalable way of finding DMRs in
              high-throughput methylation sequencing experiments. eDMR is
              available for download at http://code.google.com/p/edmr/.",
  journal  = "BMC Bioinformatics",
  volume   =  14,
  number   = "Suppl 5",
  pages    = "S10",
  month    =  apr,
  year     =  2013,
  language = "en"
}

@article{Ehrlich2002-hv,
  author  = {Ehrlich, Melanie},
  title   = {{DNA} methylation in cancer: too much, but also too little},
  journal = {Oncogene},
  volume  = {21},
  number  = {35},
  pages   = {5400--5413},
  year    = {2002},
}

@article{Meissner2008-so,
  author   = {Meissner, Alexander and Mikkelsen, Tarjei S and Gu, Hongcang and
              Wernig, Marius and Hanna, Jacob and Sivachenko, Andrey and Zhang,
              Xiaolan and Bernstein, Bradley E and Nusbaum, Chad and Jaffe,
              David B and Gnirke, Andreas and Jaenisch, Rudolf and Lander, Eric
              S},
  title    = {Genome-scale {DNA} methylation maps of pluripotent and
              differentiated cells},
  journal  = {Nature},
  volume   = {454},
  number   = {7205},
  pages    = {766--770},
  month    = aug,
  year     = {2008},
  language = {en},
  abstract = {DNA methylation is essential for normal development and has been
              implicated in many pathologies including cancer. Our knowledge
              about the genome-wide distribution of DNA methylation, how it
              changes during cellular differentiation and how it relates to
              histone methylation and other chromatin modifications in mammals
              remains limited. Here we report the generation and analysis of
              genome-scale DNA methylation profiles at nucleotide resolution in
              mammalian cells. Using high-throughput reduced representation
              bisulphite sequencing and single-molecule-based sequencing, we
              generated DNA methylation maps covering most CpG islands, and a
              representative sampling of conserved non-coding elements,
              transposons and other genomic features, for mouse embryonic stem
              cells, embryonic-stem-cell-derived and primary neural cells, and
              eight other primary tissues. Several key findings emerge from the
              data. First, DNA methylation patterns are better correlated with
              histone methylation patterns than with the underlying genome
              sequence context. Second, methylation of CpGs are dynamic
              epigenetic marks that undergo extensive changes during cellular
              differentiation, particularly in regulatory regions outside of
              core promoters. Third, analysis of embryonic-stem-cell-derived
              and primary cells reveals that 'weak' CpG islands associated with
              a specific set of developmentally regulated genes undergo
              aberrant hypermethylation during extended proliferation in vitro,
              in a pattern reminiscent of that reported in some primary
              tumours. More generally, the results establish reduced
              representation bisulphite sequencing as a powerful technology for
              epigenetic profiling of cell populations relevant to
              developmental biology, cancer and regenerative medicine.},
}

% The package name is spelled in lower case, so it is brace-protected against
% styles that capitalise the first word of a title.
@ARTICLE{Ritchie2015-ho,
  title    = "{limma} powers differential expression analyses for
              {RNA-sequencing} and microarray studies",
  author   = "Ritchie, Matthew E and Phipson, Belinda and Wu, Di and Hu, Yifang
              and Law, Charity W and Shi, Wei and Smyth, Gordon K",
  abstract = "limma is an R/Bioconductor software package that provides an
              integrated solution for analysing data from gene expression
              experiments. It contains rich features for handling complex
              experimental designs and for information borrowing to overcome
              the problem of small sample sizes. Over the past decade, limma
              has been a popular choice for gene discovery through differential
              expression analyses of microarray and high-throughput PCR data.
              The package contains particularly strong facilities for reading,
              normalizing and exploring such data. Recently, the capabilities
              of limma have been significantly expanded in two important
              directions. First, the package can now perform both differential
              expression and differential splicing analyses of RNA sequencing
              (RNA-seq) data. All the downstream analysis tools previously
              restricted to microarray data are now available for RNA-seq as
              well. These capabilities allow users to analyse both RNA-seq and
              microarray data with very similar pipelines. Second, the package
              is now able to go past the traditional gene-wise expression
              analyses in a variety of ways, analysing expression profiles in
              terms of co-regulated sets of genes or in terms of higher-order
              expression signatures. This provides enhanced possibilities for
              biological interpretation of gene expression differences. This
              article reviews the philosophy and design of the limma package,
              summarizing both new and historical features, with an emphasis on
              recent enhancements and features that have not been previously
              described.",
  journal  = "Nucleic Acids Res.",
  volume   =  43,
  number   =  7,
  pages    = "e47",
  month    =  apr,
  year     =  2015,
  language = "en"
}

@article{Smith2013-jh,
  author  = {Smith, Zachary D and Meissner, Alexander},
  title   = {{DNA} methylation: roles in mammalian development},
  journal = {Nat. Rev. Genet.},
  volume  = {14},
  number  = {3},
  pages   = {204--220},
  year    = {2013},
}

@article{Guo2013-mi,
  author   = {Guo, Weilong and Fiziev, Petko and Yan, Weihong and Cokus, Shawn
              and Sun, Xueguang and Zhang, Michael Q and Chen, Pao-Yang and
              Pellegrini, Matteo},
  title    = {{BS-Seeker2}: a versatile aligning pipeline for bisulfite
              sequencing data},
  journal  = {BMC Genomics},
  volume   = {14},
  pages    = {774},
  month    = nov,
  year     = {2013},
  language = {en},
  abstract = {BACKGROUND: DNA methylation is an important epigenetic
              modification involved in many biological processes. Bisulfite
              treatment coupled with high-throughput sequencing provides an
              effective approach for studying genome-wide DNA methylation at
              base resolution. Libraries such as whole genome bisulfite
              sequencing (WGBS) and reduced represented bisulfite sequencing
              (RRBS) are widely used for generating DNA methylomes, demanding
              efficient and versatile tools for aligning bisulfite sequencing
              data. RESULTS: We have developed BS-Seeker2, an updated version
              of BS Seeker, as a full pipeline for mapping bisulfite sequencing
              data and generating DNA methylomes. BS-Seeker2 improves
              mappability over existing aligners by using local alignment. It
              can also map reads from RRBS library by building special indexes
              with improved efficiency and accuracy. Moreover, BS-Seeker2
              provides additional function for filtering out reads with
              incomplete bisulfite conversion, which is useful in minimizing
              the overestimation of DNA methylation levels. We also defined
              CGmap and ATCGmap file formats for full representations of DNA
              methylomes, as part of the outputs of BS-Seeker2 pipeline
              together with BAM and WIG files. CONCLUSIONS: Our evaluations on
              the performance show that BS-Seeker2 works efficiently and
              accurately for both WGBS data and RRBS data. BS-Seeker2 is freely
              available at http://pellegrini.mcdb.ucla.edu/BS\_Seeker2/ and the
              Galaxy server.},
}

@article{Taylor2007-ek,
  author   = {Taylor, Kristen H and Kramer, Robin S and Davis, J Wade and Guo,
              Juyuan and Duff, Deiter J and Xu, Dong and Caldwell, Charles W
              and Shi, Huidong},
  title    = {Ultradeep bisulfite sequencing analysis of {DNA} methylation
              patterns in multiple gene promoters by 454 sequencing},
  journal  = {Cancer Res.},
  volume   = {67},
  number   = {18},
  pages    = {8511--8518},
  month    = sep,
  year     = {2007},
  language = {en},
  abstract = {We developed a novel approach for conducting multisample,
              multigene, ultradeep bisulfite sequencing analysis of DNA
              methylation patterns in clinical samples. A massively parallel
              sequencing-by-synthesis method (454 sequencing) was used to
              directly sequence >100 bisulfite PCR products in a single
              sequencing run without subcloning. We showed the utility,
              robustness, and superiority of this approach by analyzing
              methylation in 25 gene-related CpG rich regions from >40 cases of
              primary cells, including normal peripheral blood lymphocytes,
              acute lymphoblastic leukemia (ALL), chronic lymphocytic leukemia
              (CLL), follicular lymphoma (FL), and mantle cell lymphoma (MCL).
              A total of 294,631 sequences was generated with an average read
              length of 131 bp. On average, >1,600 individual sequences were
              generated for each PCR amplicon far beyond the few clones (<20)
              typically analyzed by traditional bisulfite sequencing.
              Comprehensive analysis of CpG methylation patterns at a single
              DNA molecule level using clustering algorithms revealed
              differential methylation patterns between diseases. A significant
              increase in methylation was detected in ALL and FL samples
              compared with CLL and MCL. Furthermore, a progressive spreading
              of methylation was detected from the periphery toward the center
              of select CpG islands in the ALL and FL samples. The ultradeep
              sequencing also allowed simultaneous analysis of genetic and
              epigenetic data and revealed an association between a single
              nucleotide polymorphism and the methylation present in the LRP1B
              promoter. This new generation of methylome sequencing will
              provide digital profiles of aberrant DNA methylation for
              individual human cancers and offers a robust method for the
              epigenetic classification of tumor subtypes.},
}

% Vendor and product names in the title are brace-protected so that
% sentence-casing styles keep their capitals.
@ARTICLE{Daca-Roszak2015-au,
  title    = "Impact of {SNPs} on methylation readouts by {Illumina} {Infinium}
              {HumanMethylation450} {BeadChip} Array: implications for
              comparative population studies",
  author   = "Daca-Roszak, Patrycja and Pfeifer, Aleksandra and
              {\.Z}ebracka-Gala, Jadwiga and Rusinek, Dagmara and
              Szybi{\'n}ska, Aleksandra and Jarz{\k a}b, Barbara and Witt,
              Micha{\l} and Zi{\k e}tkiewicz, Ewa",
  abstract = "BACKGROUND: Infinium HumanMethylation 450 BeadChip Arrays by
              Illumina (Illumina HM450K) are among the most popular CpG
              microarray platforms widely used in biological and medical
              research. Several recent studies highlighted the potentially
              confounding impact of the genomic variation on the results of
              methylation studies performed using Illumina's Infinium
              methylation probes. However, the complexity of SNPs impact on the
              methylation level measurements ($\beta$ values) has not been
              comprehensively described. RESULTS: In our comparative study of
              European and Asian populations performed using Illumina HM450K,
              we found that the majority of Infinium probes, which
              differentiated two examined groups, had SNPs in their target
              sequence. Characteristic tri-modal or bi-modal patterns of
              $\beta$ values distribution among individual samples were
              observed for CpGs with SNPs in the first and second position,
              respectively. To better understand how SNPs affect methylation
              readouts, we investigated their impact in the context of SNP
              position and type, and of the Illumina probe type (Infinium I or
              II). CONCLUSIONS: Our study clearly demonstrates that SNP
              variation existing in the genome, if not accounted for, may lead
              to false interpretation of the methylation signal differences
              suggested by some of the Illumina Infinium probes. In addition,
              it provides important practical clues for discriminating between
              differences due to the methylation status and to the genomic
              polymorphisms, based on the inspection of methylation readouts in
              individual samples. This approach is of special importance when
              Illumina Infinium assay is used for any comparative population
              studies, whether related to cancer, disease, ethnicity where SNP
              frequencies differentiate the studied groups.",
  journal  = "BMC Genomics",
  volume   =  16,
  pages    = "1003",
  month    =  nov,
  year     =  2015,
  language = "en"
}

@article{Assenov2014-ti,
  author   = {Assenov, Yassen and M{\"u}ller, Fabian and Lutsik, Pavlo and
              Walter, J{\"o}rn and Lengauer, Thomas and Bock, Christoph},
  title    = {Comprehensive analysis of {DNA} methylation data with {RnBeads}},
  journal  = {Nat. Methods},
  volume   = {11},
  number   = {11},
  pages    = {1138--1140},
  month    = nov,
  year     = {2014},
  language = {en},
  abstract = {RnBeads is a software tool for large-scale analysis and
              interpretation of DNA methylation data, providing a user-friendly
              analysis workflow that yields detailed hypertext reports
              (http://rnbeads.mpi-inf.mpg.de/). Supported assays include
              whole-genome bisulfite sequencing, reduced representation
              bisulfite sequencing, Infinium microarrays and any other protocol
              that produces high-resolution DNA methylation data. Notable
              applications of RnBeads include the analysis of epigenome-wide
              association studies and epigenetic biomarker discovery in cancer
              cohorts.},
}

@article{Maurano2015-jz,
  author   = {Maurano, Matthew T and Wang, Hao and John, Sam and Shafer,
              Anthony and Canfield, Theresa and Lee, Kristen and
              Stamatoyannopoulos, John A},
  title    = {Role of {DNA} Methylation in Modulating Transcription Factor
              Occupancy},
  journal  = {Cell Rep.},
  volume   = {12},
  number   = {7},
  pages    = {1184--1195},
  month    = aug,
  year     = {2015},
  language = {en},
  abstract = {Although DNA methylation is commonly invoked as a mechanism for
              transcriptional repression, the extent to which it actively
              silences transcription factor (TF) occupancy sites in vivo is
              unknown. To study the role of DNA methylation in the active
              modulation of TF binding, we quantified the effect of DNA
              methylation depletion on the genomic occupancy patterns of CTCF,
              an abundant TF with known methylation sensitivity that is capable
              of autonomous binding to its target sites in chromatin. Here, we
              show that the vast majority (>98.5\%) of the tens of thousands of
              unoccupied, methylated CTCF recognition sequences remain unbound
              upon abrogation of DNA methylation. The small fraction of sites
              that show methylation-dependent binding in vivo are in turn
              characterized by highly variable CTCF occupancy across cell
              types. Our results suggest that DNA methylation is not a primary
              groundskeeper of genomic TF landscapes, but rather a specialized
              mechanism for stabilizing intrinsically labile sites.},
}

@ARTICLE{He2011-pw,
  title    = "Tet-mediated formation of 5-carboxylcytosine and its excision by
              {TDG} in mammalian {DNA}",
  author   = "He, Yu-Fei and Li, Bin-Zhong and Li, Zheng and Liu, Peng and
              Wang, Yang and Tang, Qingyu and Ding, Jianping and Jia, Yingying
              and Chen, Zhangcheng and Li, Lin and Sun, Yan and Li, Xiuxue and
              Dai, Qing and Song, Chun-Xiao and Zhang, Kangling and He, Chuan
              and Xu, Guo-Liang",
  abstract = "The prevalent DNA modification in higher organisms is the
              methylation of cytosine to 5-methylcytosine (5mC), which is
              partially converted to 5-hydroxymethylcytosine (5hmC) by the Tet
              (ten eleven translocation) family of dioxygenases. Despite their
              importance in epigenetic regulation, it is unclear how these
              cytosine modifications are reversed. Here, we demonstrate that
              5mC and 5hmC in DNA are oxidized to 5-carboxylcytosine (5caC) by
              Tet dioxygenases in vitro and in cultured cells. 5caC is
              specifically recognized and excised by thymine-DNA glycosylase
              (TDG). Depletion of TDG in mouse embryonic stem cells leads to
              accumulation of 5caC to a readily detectable level. These data
              suggest that oxidation of 5mC by Tet proteins followed by
              TDG-mediated base excision of 5caC constitutes a pathway for
              active DNA demethylation.",
  journal  = "Science",
  volume   =  333,
  number   =  6047,
  pages    = "1303--1307",
  month    =  sep,
  year     =  2011,
  language = "en"
}

@ARTICLE{Li2011-wc,
  title    = "Tabix: fast retrieval of sequence features from generic
              {TAB-delimited} files",
  author   = "Li, Heng",
  abstract = "Tabix is the first generic tool that indexes position
              sorted files in TAB-delimited formats such as GFF, BED, PSL, SAM
              and SQL export, and quickly retrieves features overlapping
              specified regions. Tabix features include few seek function calls
              per query, data compression with gzip compatibility and direct
              FTP/HTTP access. Tabix is implemented as a free command-line tool
              as well as a library in C, Java, Perl and Python. It is
              particularly useful for manually examining local genomic features
              on the command line and enables genome viewers to support huge
              data files and remote custom tracks over networks. AVAILABILITY
              AND IMPLEMENTATION: http://samtools.sourceforge.net.",
  journal  = "Bioinformatics",
  volume   =  27,
  number   =  5,
  pages    = "718--719",
  month    =  mar,
  year     =  2011,
  language = "en"
}

@ARTICLE{Ong2014-rl,
  title    = "{CTCF}: an architectural protein bridging genome topology and
              function",
  author   = "Ong, Chin-Tong and Corces, Victor G",
  abstract = "The eukaryotic genome is organized in the three-dimensional
              nuclear space in a specific manner that is both a cause and a
              consequence of its function. This organization is partly
              established by a special class of architectural proteins, of
              which CCCTC-binding factor (CTCF) is the best characterized.
              Although CTCF has been assigned various roles that are often
              contradictory, new results now help to draw a unifying model to
              explain the many functions of this protein. CTCF creates
              boundaries between topologically associating domains in
              chromosomes and, within these domains, facilitates interactions
              between transcription regulatory sequences. Thus, CTCF links the
              architecture of the genome to its function.",
  journal  = "Nat. Rev. Genet.",
  volume   =  15,
  number   =  4,
  pages    = "234--246",
  month    =  apr,
  year     =  2014,
  language = "en"
}

@ARTICLE{Xu2015-ay,
  title    = "A novel strategy for forensic age prediction by {DNA} methylation
              and support vector regression model",
  author   = "Xu, Cheng and Qu, Hongzhu and Wang, Guangyu and Xie, Bingbing and
              Shi, Yi and Yang, Yaran and Zhao, Zhao and Hu, Lan and Fang,
              Xiangdong and Yan, Jiangwei and Feng, Lei",
  abstract = "High deviations resulting from prediction model, gender and
              population difference have limited age estimation application of
              DNA methylation markers. Here we identified 2,957 novel
              age-associated DNA methylation sites (P < 0.01 and R(2) > 0.5) in
              blood of eight pairs of Chinese Han female monozygotic twins.
              Among them, nine
              novel sites (false discovery rate < 0.01), along with three other
              reported sites, were further validated in 49 unrelated female
              volunteers with ages of 20-80 years by Sequenom Massarray. A
              total of 95 CpGs were covered in the PCR products and 11 of them
              were built the age prediction models. After comparing four
              different models including, multivariate linear regression,
              multivariate nonlinear regression, back propagation neural
              network and support vector regression, SVR was identified as the
              most robust model with the least mean absolute deviation from
              real chronological age (2.8 years) and an average accuracy of 4.7
              years predicted by only six loci from the 11 loci, as well as an
              less cross-validated error compared with linear regression model.
              Our novel strategy provides an accurate measurement that is
              highly useful in estimating the individual age in forensic
              practice as well as in tracking the aging process in other
              related applications.",
  journal  = "Sci. Rep.",
  volume   =  5,
  pages    = "17788",
  month    =  dec,
  year     =  2015,
  language = "en"
}

@ARTICLE{Klambauer2012-zl,
  title    = "{cn.MOPS}: mixture of {Poissons} for discovering copy number
              variations in next-generation sequencing data with a low false
              discovery rate",
  author   = "Klambauer, G{\"u}nter and Schwarzbauer, Karin and Mayr, Andreas
              and Clevert, Djork-Arn{\'e} and Mitterecker, Andreas and
              Bodenhofer, Ulrich and Hochreiter, Sepp",
  abstract = "Quantitative analyses of next-generation sequencing (NGS) data,
              such as the detection of copy number variations (CNVs), remain
              challenging. Current methods detect CNVs as changes in the depth
              of coverage along chromosomes. Technological or genomic
              variations in the depth of coverage thus lead to a high false
              discovery rate (FDR), even upon correction for GC content. In the
              context of association studies between CNVs and disease, a high
              FDR means many false CNVs, thereby decreasing the discovery power
              of the study after correction for multiple testing. We propose
              'Copy Number estimation by a Mixture Of PoissonS' (cn.MOPS), a
              data processing pipeline for CNV detection in NGS data. In
              contrast to previous approaches, cn.MOPS incorporates modeling of
              depths of coverage across samples at each genomic position.
              Therefore, cn.MOPS is not affected by read count variations along
              chromosomes. Using a Bayesian approach, cn.MOPS decomposes
              variations in the depth of coverage across samples into integer
              copy numbers and noise by means of its mixture components and
              Poisson distributions, respectively. The noise estimate allows
              for reducing the FDR by filtering out detections having high
              noise that are likely to be false detections. We compared cn.MOPS
              with the five most popular methods for CNV detection in NGS data
              using four benchmark datasets: (i) simulated data, (ii) NGS data
              from a male HapMap individual with implanted CNVs from the X
              chromosome, (iii) data from HapMap individuals with known CNVs,
              (iv) high coverage data from the 1000 Genomes Project. cn.MOPS
              outperformed its five competitors in terms of precision (1-FDR)
              and recall for both gains and losses in all benchmark data sets.
              The software cn.MOPS is publicly available as an R package at
              http://www.bioinf.jku.at/software/cnmops/ and at Bioconductor.",
  journal  = "Nucleic Acids Res.",
  volume   =  40,
  number   =  9,
  pages    = "e69",
  month    =  may,
  year     =  2012,
  language = "en"
}

@ARTICLE{Lister2009-sd,
  title    = "Human {DNA} methylomes at base resolution show widespread
              epigenomic differences",
  author   = "Lister, Ryan and Pelizzola, Mattia and Dowen, Robert H and
              Hawkins, R David and Hon, Gary and Tonti-Filippini, Julian and
              Nery, Joseph R and Lee, Leonard and Ye, Zhen and Ngo, Que-Minh
              and Edsall, Lee and Antosiewicz-Bourget, Jessica and Stewart, Ron
              and Ruotti, Victor and Millar, A Harvey and Thomson, James A and
              Ren, Bing and Ecker, Joseph R",
  abstract = "DNA cytosine methylation is a central epigenetic modification
              that has essential roles in cellular processes including genome
              regulation, development and disease. Here we present the first
              genome-wide, single-base-resolution maps of methylated cytosines
              in a mammalian genome, from both human embryonic stem cells and
              fetal fibroblasts, along with comparative analysis of messenger
              RNA and small RNA components of the transcriptome, several
              histone modifications, and sites of DNA-protein interaction for
              several key regulatory factors. Widespread differences were
              identified in the composition and patterning of cytosine
              methylation between the two genomes. Nearly one-quarter of all
              methylation identified in embryonic stem cells was in a non-CG
              context, suggesting that embryonic stem cells may use different
              methylation mechanisms to affect gene regulation. Methylation in
              non-CG contexts showed enrichment in gene bodies and depletion in
              protein binding sites and enhancers. Non-CG methylation
              disappeared upon induced differentiation of the embryonic stem
              cells, and was restored in induced pluripotent stem cells. We
              identified hundreds of differentially methylated regions proximal
              to genes involved in pluripotency and differentiation, and
              widespread reduced methylation levels in fibroblasts associated
              with lower transcriptional activity. These reference epigenomes
              provide a foundation for future studies exploring this key
              epigenetic modification in human disease and development.",
  journal  = "Nature",
  volume   =  462,
  number   =  7271,
  pages    = "315--322",
  month    =  nov,
  year     =  2009,
  language = "en"
}

@ARTICLE{Sun2014-ri,
  title    = "{MOABS}: model based analysis of bisulfite sequencing data",
  author   = "Sun, Deqiang and Xi, Yuanxin and Rodriguez, Benjamin and Park,
              Hyun Jung and Tong, Pan and Meong, Mira and Goodell, Margaret A
              and Li, Wei",
  abstract = "Bisulfite sequencing (BS-seq) is the gold standard for studying
              genome-wide DNA methylation. We developed MOABS to increase the
              speed, accuracy, statistical power and biological relevance of
              BS-seq data analysis. MOABS detects differential methylation with
              10-fold coverage at single-CpG resolution based on a
              Beta-Binomial hierarchical model and is capable of processing two
              billion reads in 24 CPU hours. Here, using simulated and real
              BS-seq data, we demonstrate that MOABS outperforms other leading
              algorithms, such as Fisher's exact test and BSmooth. Furthermore,
              MOABS analysis can be easily extended to differential 5hmC
              analysis using RRBS and oxBS-seq. MOABS is available at
              http://code.google.com/p/moabs/.",
  journal  = "Genome Biol.",
  volume   =  15,
  number   =  2,
  pages    = "R38",
  month    =  feb,
  year     =  2014,
  language = "en"
}

@ARTICLE{Bock2012-oh,
  title    = "{DNA} methylation dynamics during in vivo differentiation of
              blood and skin stem cells",
  author   = "Bock, Christoph and Beerman, Isabel and Lien, Wen-Hui and Smith,
              Zachary D and Gu, Hongcang and Boyle, Patrick and Gnirke, Andreas
              and Fuchs, Elaine and Rossi, Derrick J and Meissner, Alexander",
  abstract = "DNA methylation is a mechanism of epigenetic regulation that is
              common to all vertebrates. Functional studies underscore its
              relevance for tissue homeostasis, but the global dynamics of DNA
              methylation during in vivo differentiation remain underexplored.
              Here we report high-resolution DNA methylation maps of adult stem
              cell differentiation in mouse, focusing on 19 purified cell
              populations of the blood and skin lineages. DNA methylation
              changes were locus specific and relatively modest in magnitude.
              They frequently overlapped with lineage-associated transcription
              factors and their binding sites, suggesting that DNA methylation
              may protect cells from aberrant transcription factor activation.
              DNA methylation and gene expression provided complementary
              information, and combining the two enabled us to infer the
              cellular differentiation hierarchy of the blood lineage directly
              from genome-scale data. In summary, these results demonstrate
              that in vivo differentiation of adult stem cells is associated
              with small but informative changes in the genomic distribution of
              DNA methylation.",
  journal  = "Mol. Cell",
  volume   =  47,
  number   =  4,
  pages    = "633--647",
  month    =  aug,
  year     =  2012,
  language = "en"
}

@ARTICLE{Genereux2008-rk,
  title    = "Errors in the bisulfite conversion of {DNA}: modulating
              inappropriate- and failed-conversion frequencies",
  author   = "Genereux, Diane P and Johnson, Winslow C and Burden, Alice F and
              St{\"o}ger, Reinhard and Laird, Charles D",
  abstract = "Bisulfite treatment can be used to ascertain the methylation
              states of individual cytosines in DNA. Ideally, bisulfite
              treatment deaminates unmethylated cytosines to uracils, and
              leaves 5-methylcytosines unchanged. Two types of
              bisulfite-conversion error occur: inappropriate conversion of
              5-methylcytosine to thymine, and failure to convert unmethylated
              cytosine to uracil. Conventional bisulfite treatment requires
              hours of exposure to low-molarity, low-temperature bisulfite
              ('LowMT') and, sometimes, thermal denaturation. An alternate,
              high-molarity, high-temperature ('HighMT') protocol has been
              reported to accelerate conversion and to reduce inappropriate
              conversion. We used molecular encoding to obtain validated,
              individual-molecule data on failed- and inappropriate-conversion
              frequencies for LowMT and HighMT treatments of both
              single-stranded and hairpin-linked oligonucleotides. After
              accounting for bisulfite-independent error, we found that: (i)
              inappropriate-conversion events accrue predominantly on molecules
              exposed to bisulfite after they have attained complete or
              near-complete conversion; (ii) the HighMT treatment is preferable
              because it yields greater homogeneity among sites and among
              molecules in conversion rates, and thus yields more reliable
              data; (iii) different durations of bisulfite treatment will yield
              data appropriate to address different experimental questions; and
              (iv) conversion errors can be used to assess the validity of
              methylation data collected without the benefit of molecular
              encoding.",
  journal  = "Nucleic Acids Res.",
  volume   =  36,
  number   =  22,
  pages    = "e150",
  month    =  dec,
  year     =  2008,
  language = "en"
}

@ARTICLE{Saito2015-ra,
  title    = "Detection of differentially methylated regions from bisulfite-seq
              data by hidden {Markov} models incorporating genome-wide
              methylation level distributions",
  author   = "Saito, Yutaka and Mituyama, Toutai",
  abstract = "BACKGROUND: Detection of differential methylation between
              biological samples is an important task in bisulfite-seq data
              analysis. Several studies have attempted de novo finding of
              differentially methylated regions (DMRs) using hidden Markov
              models (HMMs). However, there is room for improvement in the
              design of HMMs, especially on emission functions that evaluate
              the likelihood of differential methylation at each cytosine site.
              RESULTS: We describe a new HMM for DMR detection from
              bisulfite-seq data. Our method utilizes emission functions that
              combine binomial models for aligned read counts, and beta
              mixtures for incorporating genome-wide methylation level
              distributions. We also develop unsupervised learning algorithms
              to adjust parameters of the beta-binomial models depending on
              differential methylation types (up, down, and not changed). In
              experiments on both simulated and real datasets, the new HMM
              improves DMR detection accuracy compared with HMMs in our
              previous study. Furthermore, our method achieves better accuracy
              than other methods using Fisher's exact test and methylation
              level smoothing. CONCLUSIONS: Our method enables accurate DMR
              detection from bisulfite-seq data. The implementation of our
              method is named ComMet, and distributed as a part of Bisulfighter
              package, which is available at
              http://epigenome.cbrc.jp/bisulfighter.",
  journal  = "BMC Genomics",
  volume   =  16,
  number   = "Suppl 12",
  pages    = "S3",
  month    =  dec,
  year     =  2015,
  language = "en"
}

@ARTICLE{Lovkvist2016-ky,
  title    = "{DNA} methylation in human epigenomes depends on local topology
              of {CpG} sites",
  author   = "L{\"o}vkvist, Cecilia and Dodd, Ian B and Sneppen, Kim and
              Haerter, Jan O",
  abstract = "In vertebrates, methylation of cytosine at CpG sequences is
              implicated in stable and heritable patterns of gene expression.
              The classical model for inheritance, in which individual CpG
              sites are independent, provides no explanation for the observed
              non-random patterns of methylation. We first investigate the
              exact topology of CpG clustering in the human genome associated
              to CpG islands. Then, by pooling genomic CpG clusters on the
              basis of short distances between CpGs within and long distances
              outside clusters, we show a strong dependence of methylation on
              the number and density of CpG organization. CpG clusters with
              fewer, or less densely spaced, CpGs are predominantly
              hyper-methylated, while larger clusters are predominantly
              hypo-methylated. Intermediate clusters, however, are either
              hyper- or hypo-methylated but are rarely found in intermediate
              methylation states. We develop a model for spatially-dependent
              collaboration between CpGs, where methylated CpGs recruit
              methylation enzymes that can act on CpGs over an extended local
              region, while unmethylated CpGs recruit demethylation enzymes
              that act more strongly on nearby CpGs. This model can reproduce
              the effects of CpG clustering on methylation and produces stable
              and heritable alternative methylation states of CpG clusters,
              thus providing a coherent model for methylation inheritance and
              methylation patterning.",
  journal  = "Nucleic Acids Res.",
  volume   =  44,
  number   =  11,
  pages    = "5123--5132",
  month    =  jun,
  year     =  2016,
  language = "en"
}

@ARTICLE{Bird2002-ae,
  title   = "{DNA} methylation patterns and epigenetic memory",
  author  = "Bird, Adrian",
  journal = "Genes Dev.",
  volume  =  16,
  number  =  1,
  pages   = "6--21",
  year    =  2002
}

@ARTICLE{Stadler2011-iu,
  title    = "{DNA-binding} factors shape the mouse methylome at distal
              regulatory regions",
  author   = "Stadler, Michael B and Murr, Rabih and Burger, Lukas and Ivanek,
              Robert and Lienert, Florian and Sch{\"o}ler, Anne and van
              Nimwegen, Erik and Wirbelauer, Christiane and Oakeley, Edward J
              and Gaidatzis, Dimos and Tiwari, Vijay K and Sch{\"u}beler, Dirk",
  abstract = "Methylation of cytosines is an essential epigenetic modification
              in mammalian genomes, yet the rules that govern methylation
              patterns remain largely elusive. To gain insights into this
              process, we generated base-pair-resolution mouse methylomes in
              stem cells and neuronal progenitors. Advanced quantitative
              analysis identified low-methylated regions (LMRs) with an average
              methylation of 30\%. These represent CpG-poor distal regulatory
              regions as evidenced by location, DNase I hypersensitivity,
              presence of enhancer chromatin marks and enhancer activity in
              reporter assays. LMRs are occupied by DNA-binding factors and
              their binding is necessary and sufficient to create LMRs. A
              comparison of neuronal and stem-cell methylomes confirms this
              dependency, as cell-type-specific LMRs are occupied by
              cell-type-specific transcription factors. This study provides
              methylome references for the mouse and shows that DNA-binding
              factors locally influence DNA methylation, enabling the
              identification of active regulatory regions.",
  journal  = "Nature",
  volume   =  480,
  number   =  7378,
  pages    = "490--495",
  month    =  dec,
  year     =  2011,
  language = "en"
}

@ARTICLE{Burger2013-rn,
  title    = "Identification of active regulatory regions from {DNA}
              methylation data",
  author   = "Burger, Lukas and Gaidatzis, Dimos and Sch{\"u}beler, Dirk and
              Stadler, Michael B",
  abstract = "We have recently shown that transcription factor binding leads to
              defined reduction in DNA methylation, allowing for the
              identification of active regulatory regions from high-resolution
              methylomes. Here, we present MethylSeekR, a computational tool to
              accurately identify such footprints from bisulfite-sequencing
              data. Applying our method to a large number of published human
              methylomes, we demonstrate its broad applicability and generalize
              our previous findings from a neuronal differentiation system to
              many cell types and tissues. MethylSeekR is available as an R
              package at www.bioconductor.org.",
  journal  = "Nucleic Acids Res.",
  volume   =  41,
  number   =  16,
  pages    = "e155",
  month    =  sep,
  year     =  2013,
  language = "en"
}

@ARTICLE{Brinkman2010-se,
  title    = "Whole-genome {DNA} methylation profiling using {MethylCap-seq}",
  author   = "Brinkman, Arie B and Simmer, Femke and Ma, Kelong and Kaan, Anita
              and Zhu, Jingde and Stunnenberg, Hendrik G",
  abstract = "MethylCap-seq is a robust procedure for genome-wide profiling of
              DNA methylation. The approach consists of the capture of
              methylated DNA using the MBD domain of MeCP2, and subsequent
              next-generation sequencing of eluted DNA. Elution of the captured
              methylated DNA is done in steps using a salt gradient, which
              stratifies the genome into fractions with different CpG density.
              The enrichment reached within the individual eluates allows for
              cost-effective deep sequence coverage. The profiles together
              yield a detailed genome-wide map of methylated regions and
              readily allows detection of DNA methylation in known and novel
              regions. Here, we describe principles and details of the
              MethylCap-seq procedure using different sources of starting
              material.",
  journal  = "Methods",
  volume   =  52,
  number   =  3,
  pages    = "232--236",
  month    =  nov,
  year     =  2010,
  language = "en"
}

@ARTICLE{Hansen2012-kw,
  title   = "{BSmooth}: from whole genome bisulfite sequencing reads to
             differentially methylated regions",
  author  = "Hansen, Kasper D and Langmead, Benjamin and Irizarry, Rafael A",
  journal = "Genome Biol.",
  volume  =  13,
  number  =  10,
  pages   = "R83",
  year    =  2012
}

@ARTICLE{Kunde-Ramamoorthy2014-fg,
  title    = "Comparison and quantitative verification of mapping algorithms
              for whole-genome bisulfite sequencing",
  author   = "Kunde-Ramamoorthy, Govindarajan and Coarfa, Cristian and
              Laritsky, Eleonora and Kessler, Noah J and Harris, R Alan and Xu,
              Mingchu and Chen, Rui and Shen, Lanlan and Milosavljevic,
              Aleksandar and Waterland, Robert A",
  abstract = "Coupling bisulfite conversion with next-generation sequencing
              (Bisulfite-seq) enables genome-wide measurement of DNA
              methylation, but poses unique challenges for mapping. However,
              despite a proliferation of Bisulfite-seq mapping tools, no
              systematic comparison of their genomic coverage and quantitative
              accuracy has been reported. We sequenced bisulfite-converted DNA
              from two tissues from each of two healthy human adults and
              systematically compared five widely used Bisulfite-seq mapping
              algorithms: Bismark, BSMAP, Pash, BatMeth and BS Seeker. We
              evaluated their computational speed and genomic coverage and
              verified their percentage methylation estimates. With the
              exception of BatMeth, all mappers covered >70\% of CpG sites
              genome-wide and yielded highly concordant estimates of percentage
              methylation (r(2) $\geq$ 0.95). Fourfold variation in mapping
              time was found between BSMAP (fastest) and Pash (slowest). In
              each library, 8-12\% of genomic regions covered by Bismark and
              Pash were not covered by BSMAP. An experiment using simulated
              reads confirmed that Pash has an exceptional ability to uniquely
              map reads in genomic regions of structural variation. Independent
              verification by bisulfite pyrosequencing generally confirmed the
              percentage methylation estimates by the mappers. Of these
              algorithms, Bismark provides an attractive combination of
              processing speed, genomic coverage and quantitative accuracy,
              whereas Pash offers considerably higher genomic coverage.",
  journal  = "Nucleic Acids Res.",
  volume   =  42,
  number   =  6,
  pages    = "e43",
  month    =  apr,
  year     =  2014,
  language = "en"
}

@ARTICLE{Booth2014-tk,
  title   = "Quantitative sequencing of 5-formylcytosine in {DNA} at
             single-base resolution",
  author  = "Booth, Michael J and Marsico, Giovanni and Bachman, Martin and
             Beraldi, Dario and Balasubramanian, Shankar",
  journal = "Nat. Chem.",
  volume  =  6,
  number  =  5,
  pages   = "435--440",
  year    =  2014
}

@ARTICLE{Kurukuti2006-ef,
  title    = "{CTCF} binding at the {H19} imprinting control region mediates
              maternally inherited higher-order chromatin conformation to
              restrict enhancer access to {Igf2}",
  author   = "Kurukuti, Sreenivasulu and Tiwari, Vijay Kumar and Tavoosidana,
              Gholamreza and Pugacheva, Elena and Murrell, Adele and Zhao,
              Zhihu and Lobanenkov, Victor and Reik, Wolf and Ohlsson, Rolf",
  abstract = "It is thought that the H19 imprinting control region (ICR)
              directs the silencing of the maternally inherited Igf2 allele
              through a CTCF-dependent chromatin insulator. The ICR has been
              shown to interact physically with a silencer region in Igf2,
              differentially methylated region (DMR)1, but the role of CTCF in
              this chromatin loop and whether it restricts the physical access
              of distal enhancers to Igf2 is not known. We performed systematic
              chromosome conformation capture analyses in the Igf2/H19 region
              over >160 kb, identifying sequences that interact physically with
              the distal enhancers and the ICR. We found that, on the paternal
              chromosome, enhancers interact with the Igf2 promoters but that,
              on the maternal allele, this is prevented by CTCF binding within
              the H19 ICR. CTCF binding in the maternal ICR regulates its
              interaction with matrix attachment region (MAR)3 and DMR1 at
              Igf2, thus forming a tight loop around the maternal Igf2 locus,
              which may contribute to its silencing. Mutation of CTCF binding
              sites in the H19 ICR leads to loss of CTCF binding and de novo
              methylation of a CTCF target site within Igf2 DMR1, showing that
              CTCF can coordinate regional epigenetic marks. This systematic
              chromosome conformation capture analysis of an imprinting cluster
              reveals that CTCF has a critical role in the epigenetic
              regulation of higher-order chromatin structure and gene silencing
              over considerable distances in the genome.",
  journal  = "Proc. Natl. Acad. Sci. U. S. A.",
  volume   =  103,
  number   =  28,
  pages    = "10684--10689",
  month    =  jul,
  year     =  2006,
  language = "en"
}

@article{Wang2012-zn,
  author   = {Wang, Hao and Maurano, Matthew T and Qu, Hongzhu and Varley,
              Katherine E and Gertz, Jason and Pauli, Florencia and Lee,
              Kristen and Canfield, Theresa and Weaver, Molly and Sandstrom,
              Richard and Thurman, Robert E and Kaul, Rajinder and Myers,
              Richard M and Stamatoyannopoulos, John A},
  title    = {Widespread plasticity in {CTCF} occupancy linked to {DNA}
              methylation},
  journal  = {Genome Res.},
  year     = {2012},
  month    = sep,
  volume   = {22},
  number   = {9},
  pages    = {1680--1688},
  language = {en},
  abstract = {CTCF is a ubiquitously expressed regulator of fundamental genomic
              processes including transcription, intra- and interchromosomal
              interactions, and chromatin structure. Because of its critical
              role in genome function, CTCF binding patterns have long been
              assumed to be largely invariant across different cellular
              environments. Here we analyze genome-wide occupancy patterns of
              CTCF by ChIP-seq in 19 diverse human cell types, including normal
              primary cells and immortal lines. We observed highly reproducible
              yet surprisingly plastic genomic binding landscapes, indicative
              of strong cell-selective regulation of CTCF occupancy. Comparison
              with massively parallel bisulfite sequencing data indicates that
              41\% of variable CTCF binding is linked to differential DNA
              methylation, concentrated at two critical positions within the
              CTCF recognition sequence. Unexpectedly, CTCF binding patterns
              were markedly different in normal versus immortal cells, with the
              latter showing widespread disruption of CTCF binding associated
              with increased methylation. Strikingly, this disruption is
              accompanied by up-regulation of CTCF expression, with the result
              that both normal and immortal cells maintain the same average
              number of CTCF occupancy sites genome-wide. These results reveal
              a tight linkage between DNA methylation and the global occupancy
              patterns of a major sequence-specific regulatory factor.}
}

@MANUAL{Adler2014-nn,
  title  = "{ff}: Memory-efficient Storage of Large Data on Disk and Fast
            Access Functions",
  author = "Adler, D and Gl{\"a}ser, C and Nenadic, O and Oehlschl{\"a}gel, J
            and Zucchini, W",
  year   =  2014,
  note   = "R package",
  url    = "https://CRAN.R-project.org/package=ff"
}

@ARTICLE{Adams2012-nh,
  title    = "{BLUEPRINT} to decode the epigenetic signature written in blood",
  author   = "Adams, David and Altucci, Lucia and Antonarakis, Stylianos E and
              Ballesteros, Juan and Beck, Stephan and Bird, Adrian and Bock,
              Christoph and Boehm, Bernhard and Campo, Elias and Caricasole,
              Andrea and Dahl, Fredrik and Dermitzakis, Emmanouil T and Enver,
              Tariq and Esteller, Manel and Estivill, Xavier and
              Ferguson-Smith, Anne and Fitzgibbon, Jude and Flicek, Paul and
              Giehl, Claudia and Graf, Thomas and Grosveld, Frank and Guigo,
              Roderic and Gut, Ivo and Helin, Kristian and Jarvius, Jonas and
              K{\"u}ppers, Ralf and Lehrach, Hans and Lengauer, Thomas and
              Lernmark, {\AA}ke and Leslie, David and Loeffler, Markus and
              Macintyre, Elizabeth and Mai, Antonello and Martens, Joost H A
              and Minucci, Saverio and Ouwehand, Willem H and Pelicci, Pier
              Giuseppe and Pendeville, H{\'e}l{\`e}ne and Porse, Bo and Rakyan,
              Vardhman and Reik, Wolf and Schrappe, Martin and Sch{\"u}beler,
              Dirk and Seifert, Martin and Siebert, Reiner and Simmons, David
              and Soranzo, Nicole and Spicuglia, Salvatore and Stratton,
              Michael and Stunnenberg, Hendrik G and Tanay, Amos and Torrents,
              David and Valencia, Alfonso and Vellenga, Edo and Vingron, Martin
              and Walter, J{\"o}rn and Willcocks, Spike",
  journal  = "Nat. Biotechnol.",
  volume   =  30,
  number   =  3,
  pages    = "224--226",
  month    =  mar,
  year     =  2012,
  language = "en"
}

@article{Heyn2012-kl,
  author   = {Heyn, Holger and Li, Ning and Ferreira, Humberto J and Moran,
              Sebastian and Pisano, David G and Gomez, Antonio and Diez, Javier
              and Sanchez-Mut, Jose V and Setien, Fernando and Carmona, F
              Javier and Puca, Annibale A and Sayols, Sergi and Pujana, Miguel
              A and Serra-Musach, Jordi and Iglesias-Platas, Isabel and
              Formiga, Francesc and Fernandez, Agustin F and Fraga, Mario F and
              Heath, Simon C and Valencia, Alfonso and Gut, Ivo G and Wang, Jun
              and Esteller, Manel},
  title    = {Distinct {DNA} methylomes of newborns and centenarians},
  journal  = {Proc. Natl. Acad. Sci. U. S. A.},
  year     = {2012},
  month    = jun,
  volume   = {109},
  number   = {26},
  pages    = {10522--10527},
  language = {en},
  abstract = {Human aging cannot be fully understood in terms of the
              constrained genetic setting. Epigenetic drift is an alternative
              means of explaining age-associated alterations. To address this
              issue, we performed whole-genome bisulfite sequencing (WGBS) of
              newborn and centenarian genomes. The centenarian DNA had a lower
              DNA methylation content and a reduced correlation in the
              methylation status of neighboring cytosine--phosphate--guanine
              (CpGs) throughout the genome in comparison with the more
              homogeneously methylated newborn DNA. The more hypomethylated
              CpGs observed in the centenarian DNA compared with the neonate
              covered all genomic compartments, such as promoters, exonic,
              intronic, and intergenic regions. For regulatory regions, the
              most hypomethylated sequences in the centenarian DNA were present
              mainly at CpG-poor promoters and in tissue-specific genes,
              whereas a greater level of DNA methylation was observed in CpG
              island promoters. We extended the study to a larger cohort of
              newborn and nonagenarian samples using a 450,000 CpG-site DNA
              methylation microarray that reinforced the observation of more
              hypomethylated DNA sequences in the advanced age group. WGBS and
              450,000 analyses of middle-age individuals demonstrated DNA
              methylomes in the crossroad between the newborn and the
              nonagenarian/centenarian groups. Our study constitutes a unique
              DNA methylation analysis of the extreme points of human life at a
              single-nucleotide resolution level.}
}

@article{Assenov2014-fm,
  author  = {Assenov, Yassen and M{\"u}ller, Fabian and Lutsik, Pavlo and
             Walter, J{\"o}rn and Lengauer, Thomas and Bock, Christoph},
  title   = {Comprehensive analysis of {DNA} methylation data with {RnBeads}},
  journal = {Nat. Methods},
  year    = {2014},
  volume  = {11},
  number  = {11},
  pages   = {1138--1140}
}

@article{Krueger2011-vv,
  author   = {Krueger, Felix and Andrews, Simon R},
  title    = {Bismark: a flexible aligner and methylation caller for
              {Bisulfite-Seq} applications},
  journal  = {Bioinformatics},
  year     = {2011},
  month    = jun,
  volume   = {27},
  number   = {11},
  pages    = {1571--1572},
  language = {en},
  abstract = {SUMMARY: A combination of bisulfite treatment of DNA and
              high-throughput sequencing (BS-Seq) can capture a snapshot of a
              cell's epigenomic state by revealing its genome-wide cytosine
              methylation at single base resolution. Bismark is a flexible tool
              for the time-efficient analysis of BS-Seq data which performs
              both read mapping and methylation calling in a single convenient
              step. Its output discriminates between cytosines in CpG, CHG and
              CHH context and enables bench scientists to visualize and
              interpret their methylation data soon after the sequencing run is
              completed. AVAILABILITY AND IMPLEMENTATION: Bismark is released
              under the GNU GPLv3+ licence. The source code is freely available
              from www.bioinformatics.bbsrc.ac.uk/projects/bismark/.}
}

@article{Stadler2011-yv,
  author   = {Stadler, Michael B and Murr, Rabih and Burger, Lukas and Ivanek,
              Robert and Lienert, Florian and Sch{\"o}ler, Anne and van
              Nimwegen, Erik and Wirbelauer, Christiane and Oakeley, Edward J
              and Gaidatzis, Dimos and Tiwari, Vijay K and Sch{\"u}beler, Dirk},
  title    = {{DNA-binding} factors shape the mouse methylome at distal
              regulatory regions},
  journal  = {Nature},
  year     = {2011},
  month    = dec,
  volume   = {480},
  number   = {7378},
  pages    = {490--495},
  language = {en},
  abstract = {Methylation of cytosines is an essential epigenetic modification
              in mammalian genomes, yet the rules that govern methylation
              patterns remain largely elusive. To gain insights into this
              process, we generated base-pair-resolution mouse methylomes in
              stem cells and neuronal progenitors. Advanced quantitative
              analysis identified low-methylated regions (LMRs) with an average
              methylation of 30\%. These represent CpG-poor distal regulatory
              regions as evidenced by location, DNase I hypersensitivity,
              presence of enhancer chromatin marks and enhancer activity in
              reporter assays. LMRs are occupied by DNA-binding factors and
              their binding is necessary and sufficient to create LMRs. A
              comparison of neuronal and stem-cell methylomes confirms this
              dependency, as cell-type-specific LMRs are occupied by
              cell-type-specific transcription factors. This study provides
              methylome references for the mouse and shows that DNA-binding
              factors locally influence DNA methylation, enabling the
              identification of active regulatory regions.}
}

@article{Deaton2011-pm,
  author  = {Deaton, A M and Bird, A},
  title   = {{CpG} islands and the regulation of transcription},
  journal = {Genes Dev.},
  year    = {2011},
  volume  = {25},
  number  = {10},
  pages   = {1010--1022}
}

@article{McRae2014-gf,
  author   = {McRae, Allan F and Powell, Joseph E and Henders, Anjali K and
              Bowdler, Lisa and Hemani, Gibran and Shah, Sonia and Painter,
              Jodie N and Martin, Nicholas G and Visscher, Peter M and
              Montgomery, Grant W},
  title    = {Contribution of genetic variation to transgenerational
              inheritance of {DNA} methylation},
  journal  = {Genome Biol.},
  year     = {2014},
  month    = may,
  volume   = {15},
  number   = {5},
  pages    = {R73},
  language = {en},
  abstract = {BACKGROUND: Despite the important role DNA methylation plays in
              transcriptional regulation, the transgenerational inheritance of
              DNA methylation is not well understood. The genetic heritability
              of DNA methylation has been estimated using twin pairs, although
              concern has been expressed whether the underlying assumption of
              equal common environmental effects are applicable due to
              intrauterine differences between monozygotic and dizygotic twins.
              We estimate the heritability of DNA methylation on peripheral
              blood leukocytes using Illumina HumanMethylation450 array using a
              family based sample of 614 people from 117 families, allowing
              comparison both within and across generations. RESULTS: The
              correlations from the various available relative pairs indicate
              that on average the similarity in DNA methylation between
              relatives is predominantly due to genetic effects with any common
              environmental or zygotic effects being limited. The average
              heritability of DNA methylation measured at probes with no known
              SNPs is estimated as 0.187. The ten most heritable methylation
              probes were investigated with a genome-wide association study,
              all showing highly statistically significant cis mQTLs. Further
              investigation of one of these cis mQTL, found in the MHC region
              of chromosome 6, showed the most significantly associated SNP was
              also associated with over 200 other DNA methylation probes in
              this region and the gene expression level of 9 genes.
              CONCLUSIONS: The majority of transgenerational similarity in DNA
              methylation is attributable to genetic effects, and approximately
              20\% of individual differences in DNA methylation in the
              population are caused by DNA sequence variation that is not
              located within CpG sites.}
}



@article{li2009fast,
  title={Fast and accurate short read alignment with {Burrows--Wheeler} transform},
  author={Li, Heng and Durbin, Richard},
  journal={Bioinformatics},
  volume={25},
  number={14},
  pages={1754--1760},
  year={2009},
  publisher={Oxford University Press}
}
@article{li2009soap2,
  title={{SOAP2}: an improved ultrafast tool for short read alignment},
  author={Li, Ruiqiang and Yu, Chang and Li, Yingrui and Lam, Tak-Wah and Yiu, Siu-Ming and Kristiansen, Karsten and Wang, Jun},
  journal={Bioinformatics},
  volume={25},
  number={15},
  pages={1966--1967},
  year={2009},
  publisher={Oxford University Press}
}
@article{langmead2012fast,
  title={Fast gapped-read alignment with {Bowtie} 2},
  author={Langmead, Ben and Salzberg, Steven L},
  journal={Nature Methods},
  volume={9},
  number={4},
  pages={357--359},
  year={2012},
  publisher={Nature Publishing Group}
}

@Book{xie2015,
  title = {Dynamic Documents with {R} and {knitr}},
  author = {Xie, Yihui},
  publisher = {Chapman and Hall/CRC},
  address = {Boca Raton, Florida},
  year = {2015},
  edition = {Second},
  isbn = {978-1498716963},
  url = {http://yihui.name/knitr/},
}
@article{knuth1984,
  author    = {Knuth, Donald E.},
  title     = {Literate programming},
  journal   = {The Computer Journal},
  publisher = {British Computer Society},
  year      = {1984},
  volume    = {27},
  number    = {2},
  pages     = {97--111}
}

@Manual{gmapR,
  title = {{gmapR}: An {R} interface to the {GMAP/GSNAP/GSTRUCT} suite},
  author = {Barr, Cory and Wu, Thomas and Lawrence, Michael},
  year = {2019},
  note = {R package version 1.24.2},
}

@article{Rqc,
  author  = {{de Souza}, Welliton and Carvalho, Benilton S and Lopes-Cendes, Iscia},
  title   = {{Rqc}: A {Bioconductor} Package for Quality Control of High-Throughput Sequencing Data},
  journal = {Journal of Statistical Software, Code Snippets},
  year    = {2018},
  volume  = {87},
  number  = {2},
  pages   = {1--14},
  doi     = {10.18637/jss.v087.c02}
}

@manual{pheatmap,
  author = {Kolde, Raivo},
  title  = {pheatmap: Pretty Heatmaps},
  year   = {2019},
  note   = {R package version 1.0.12},
  url    = {https://CRAN.R-project.org/package=pheatmap}
}

@article{conesa_survey_2016,
  title = {A survey of best practices for {RNA}-seq data analysis},
  volume = {17},
  issn = {1474-760X},
  url = {https://doi.org/10.1186/s13059-016-0881-8},
  doi = {10.1186/s13059-016-0881-8},
  abstract = {RNA-sequencing (RNA-seq) has a wide variety of applications, but no single analysis pipeline can be used in all cases. We review all of the major steps in RNA-seq data analysis, including experimental design, quality control, read alignment, quantification of gene and transcript levels, visualization, differential gene expression, alternative splicing, functional analysis, gene fusion detection and eQTL mapping. We highlight the challenges associated with each step. We discuss the analysis of small RNAs and the integration of RNA-seq with other functional genomics techniques. Finally, we discuss the outlook for novel technologies that are changing the state of the art in transcriptomics.},
  urldate = {2018-05-23},
  journal = {Genome Biology},
  author = {Conesa, Ana and Madrigal, Pedro and Tarazona, Sonia and Gomez-Cabrero, David and Cervera, Alejandra and McPherson, Andrew and Szcze{\'s}niak, Micha{\l} Wojciech and Gaffney, Daniel J. and Elo, Laura L. and Zhang, Xuegong and Mortazavi, Ali},
  month = jan,
  year = {2016},
  keywords = {Differential Expression Analysis, Gene Transfer Format, Reference Transcriptome, Transcript Discovery, Transcript Identification},
  pages = {13},
  annote = {Pages 13 in PDF},
  file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/XJENQ8TS/Conesa et al. - 2016 - A survey of best practices for RNA-seq data analys.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/TJGBENHN/s13059-016-0881-8.html:text/html}
}

@article{wang_rna-seq:_2009,
  author     = {Wang, Zhong and Gerstein, Mark and Snyder, Michael},
  title      = {{RNA}-{Seq}: a revolutionary tool for transcriptomics},
  shorttitle = {{RNA}-{Seq}},
  journal    = {Nature Reviews Genetics},
  year       = {2009},
  month      = jan,
  volume     = {10},
  number     = {1},
  pages      = {57--63},
  issn       = {1471-0064},
  doi        = {10.1038/nrg2484},
  url        = {https://www.nature.com/articles/nrg2484},
  urldate    = {2018-05-23},
  language   = {en},
  copyright  = {2009 Nature Publishing Group},
  abstract   = {RNA-Seq is a recently developed approach to transcriptome profiling that uses deep-sequencing technologies. Studies using this method have already altered our view of the extent and complexity of eukaryotic transcriptomes. RNA-Seq also provides a far more precise measurement of levels of transcripts and their isoforms than other methods. This article describes the RNA-Seq approach, the challenges associated with its application, and the advances made so far in characterizing several eukaryote transcriptomes.},
  file       = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/MAI9MGZN/Wang et al. - 2009 - RNA-Seq a revolutionary tool for transcriptomics.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/5ENM5XQS/nrg2484.html:text/html}
}

@article{trapnell_transcript_2010,
  title = {Transcript assembly and quantification by {RNA}-{Seq} reveals unannotated transcripts and isoform switching during cell differentiation},
  volume = {28},
  copyright = {2010 Nature Publishing Group},
  issn = {1546-1696},
  url = {https://www.nature.com/articles/nbt.1621},
  doi = {10.1038/nbt.1621},
  abstract = {High-throughput mRNA sequencing (RNA-Seq) promises simultaneous transcript discovery and abundance estimation. However, this would require algorithms that are not restricted by prior gene annotations and that account for alternative transcription and splicing. Here we introduce such algorithms in an open-source software program called Cufflinks. To test Cufflinks, we sequenced and analyzed {\textgreater}430 million paired 75-bp RNA-Seq reads from a mouse myoblast cell line over a differentiation time series. We detected 13,692 known transcripts and 3,724 previously unannotated ones, 62\% of which are supported by independent expression data or by homologous genes in other species. Over the time series, 330 genes showed complete switches in the dominant transcription start site (TSS) or splice isoform, and we observed more subtle shifts in 1,304 other genes. These results suggest that Cufflinks can illuminate the substantial regulatory flexibility and complexity in even this well-studied model of muscle development and that it can improve transcriptome-based genome annotation.},
  language = {en},
  number = {5},
  urldate = {2018-05-23},
  journal = {Nature Biotechnology},
  author = {Trapnell, Cole and Williams, Brian A. and Pertea, Geo and Mortazavi, Ali and Kwan, Gordon and van Baren, Marijke J. and Salzberg, Steven L. and Wold, Barbara J. and Pachter, Lior},
  month = may,
  year = {2010},
  pages = {511--515},
  file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/4RFA6GV7/Trapnell et al. - 2010 - Transcript assembly and quantification by RNA-Seq .pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/56K2445T/nbt.html:text/html}
}

@article{trapnell_differential_2012,
  title = {Differential gene and transcript expression analysis of {RNA}-seq experiments with {TopHat} and {Cufflinks}},
  volume = {7},
  copyright = {2012 Nature Publishing Group},
  issn = {1750-2799},
  url = {https://www.nature.com/articles/nprot.2012.016},
  doi = {10.1038/nprot.2012.016},
  abstract = {Recent advances in high-throughput cDNA sequencing (RNA-seq) can reveal new genes and splice variants and quantify expression genome-wide in a single assay. The volume and complexity of data from RNA-seq experiments necessitate scalable, fast and mathematically principled analysis software. TopHat and Cufflinks are free, open-source software tools for gene discovery and comprehensive expression analysis of high-throughput mRNA sequencing (RNA-seq) data. Together, they allow biologists to identify new genes and new splice variants of known ones, as well as compare gene and transcript expression under two or more conditions. This protocol describes in detail how to use TopHat and Cufflinks to perform such analyses. It also covers several accessory tools and utilities that aid in managing data, including CummeRbund, a tool for visualizing RNA-seq analysis results. Although the procedure assumes basic informatics skills, these tools assume little to no background with RNA-seq analysis and are meant for novices and experts alike. The protocol begins with raw sequencing reads and produces a transcriptome assembly, lists of differentially expressed and regulated genes and transcripts, and publication-quality visualizations of analysis results. The protocol's execution time depends on the volume of transcriptome sequencing data and available computing resources but takes less than 1 d of computer time for typical experiments and $\sim$1 h of hands-on time.},
  language = {en},
  number = {3},
  urldate = {2018-05-23},
  journal = {Nature Protocols},
  author = {Trapnell, Cole and Roberts, Adam and Goff, Loyal and Pertea, Geo and Kim, Daehwan and Kelley, David R. and Pimentel, Harold and Salzberg, Steven L. and Rinn, John L. and Pachter, Lior},
  month = mar,
  year = {2012},
  pages = {562--578},
  file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/F936GUHM/Trapnell et al. - 2012 - Differential gene and transcript expression analys.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/2PR5NYCA/nprot.2012.html:text/html}
}

@article{maza_comparison_2013,
  author     = {Maza, Elie and Frasse, Pierre and Senin, Pavel and Bouzayen, Mondher and Zouine, Mohamed},
  title      = {Comparison of normalization methods for differential gene expression analysis in {RNA}-{Seq} experiments: {A} matter of relative size of studied transcriptomes},
  shorttitle = {Comparison of normalization methods for differential gene expression analysis in {RNA}-{Seq} experiments},
  journal    = {Communicative \& Integrative Biology},
  year       = {2013},
  month      = nov,
  volume     = {6},
  number     = {6},
  pages      = {e25849},
  issn       = {1942-0889},
  doi        = {10.4161/cib.25849},
  url        = {http://www.tandfonline.com/doi/abs/10.4161/cib.25849},
  urldate    = {2018-05-31},
  language   = {en},
  file       = {Maza et al. - 2013 - Comparison of normalization methods for differenti.pdf:/Users/buyar/Documents/zotero_library/storage/BGVNVDD5/Maza et al. - 2013 - Comparison of normalization methods for differenti.pdf:application/pdf}
}

@article{soneson_comparison_2013,
  author   = {Soneson, Charlotte and Delorenzi, Mauro},
  title    = {A comparison of methods for differential expression analysis of {RNA}-seq data},
  journal  = {BMC Bioinformatics},
  year     = {2013},
  month    = mar,
  volume   = {14},
  pages    = {91},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-14-91},
  url      = {https://doi.org/10.1186/1471-2105-14-91},
  urldate  = {2018-05-31},
  keywords = {Gene expression, Differential expression, RNA-seq},
  annote   = {Pages 91 in PDF},
  abstract = {Finding genes that are differentially expressed between conditions is an integral part of understanding the molecular basis of phenotypic variation. In the past decades, DNA microarrays have been used extensively to quantify the abundance of mRNA corresponding to different genes, and more recently high-throughput sequencing of cDNA (RNA-seq) has emerged as a powerful competitor. As the cost of sequencing decreases, it is conceivable that the use of RNA-seq for differential expression analysis will increase rapidly. To exploit the possibilities and address the challenges posed by this relatively new type of data, a number of software packages have been developed especially for differential expression analysis of RNA-seq data.},
  file     = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/IJREPMRK/Soneson and Delorenzi - 2013 - A comparison of methods for differential expressio.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/LBLE8SB9/1471-2105-14-91.html:text/html}
}

@article{risso_normalization_2014,
  author    = {Risso, Davide and Ngai, John and Speed, Terence P. and Dudoit, Sandrine},
  title     = {Normalization of {RNA}-seq data using factor analysis of control genes or samples},
  journal   = {Nature Biotechnology},
  year      = {2014},
  month     = sep,
  volume    = {32},
  number    = {9},
  pages     = {896--902},
  issn      = {1546-1696},
  doi       = {10.1038/nbt.2931},
  url       = {https://www.nature.com/articles/nbt.2931},
  urldate   = {2018-07-04},
  language  = {en},
  copyright = {2014 Nature Publishing Group},
  abstract  = {Normalization of RNA-sequencing (RNA-seq) data has proven essential to ensure accurate inference of expression levels. Here, we show that usual normalization approaches mostly account for sequencing depth and fail to correct for library preparation and other more complex unwanted technical effects. We evaluate the performance of the External RNA Control Consortium (ERCC) spike-in controls and investigate the possibility of using them directly for normalization. We show that the spike-ins are not reliable enough to be used in standard global-scaling or regression-based normalization procedures. We propose a normalization strategy, called remove unwanted variation (RUV), that adjusts for nuisance technical effects by performing factor analysis on suitable sets of control genes (e.g., ERCC spike-ins) or samples (e.g., replicate libraries). Our approach leads to more accurate estimates of expression fold-changes and tests of differential expression compared to state-of-the-art normalization methods. In particular, RUV promises to be valuable for large collaborative projects involving multiple laboratories, technicians, and/or sequencing platforms.},
  file      = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/D9VE6VBB/Risso et al. - 2014 - Normalization of RNA-seq data using factor analysi.pdf:application/pdf}
}

@article{love_moderated_2014,
  title = {Moderated estimation of fold change and dispersion for {RNA}-seq data with {DESeq2}},
  volume = {15},
  issn = {1474-760X},
  url = {http://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8},
  doi = {10.1186/s13059-014-0550-8},
  abstract = {In comparative high-throughput sequencing assays, a fundamental task is the analysis of count data, such as read counts per gene in RNA-seq, for evidence of systematic changes across experimental conditions. Small replicate numbers, discreteness, large dynamic range and the presence of outliers require a suitable statistical approach. We present DESeq2, a method for differential analysis of count data, using shrinkage estimation for dispersions and fold changes to improve stability and interpretability of estimates. This enables a more quantitative analysis focused on the strength rather than the mere presence of differential expression. The DESeq2 package is available at http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html.},
  language = {en},
  number = {12},
  urldate = {2018-07-16},
  journal = {Genome Biology},
  author = {Love, Michael I and Huber, Wolfgang and Anders, Simon},
  month = dec,
  year = {2014},
  pages = {550},
  file = {Love et al. - 2014 - Moderated estimation of fold change and dispersion.pdf:/Users/buyar/Documents/zotero_library/storage/XUERYQ5C/Love et al. - 2014 - Moderated estimation of fold change and dispersion.pdf:application/pdf}
}

@article{reimand_g:profilerweb-based_2007,
  title = {g:{Profiler}---a web-based toolset for functional profiling of gene lists from large-scale experiments},
  volume = {35},
  issn = {0305-1048},
  shorttitle = {g:{Profiler}},
  url = {https://academic.oup.com/nar/article/35/suppl_2/W193/2920757},
  doi = {10.1093/nar/gkm226},
  abstract = {Abstract.  g:Profiler (http://biit.cs.ut.ee/gprofiler/) is a public web server for characterising and manipulating gene lists resulting from mining high-through},
  language = {en},
  number = {suppl\_2},
  urldate = {2018-07-16},
  journal = {Nucleic Acids Research},
  author = {Reimand, J{\"u}ri and Kull, Meelis and Peterson, Hedi and Hansen, Jaanus and Vilo, Jaak},
  month = jul,
  year = {2007},
  pages = {W193--W200},
  file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/DJSVPIHQ/Reimand et al. - 2007 - gProfiler—a web-based toolset for functional prof.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/RJQRFWNV/2920757.html:text/html}
}

@book{wickham_ggplot2:_2016,
  title = {{ggplot2}: {Elegant} {Graphics} for {Data} {Analysis}},
  isbn = {978-3-319-24277-4},
  shorttitle = {ggplot2},
  abstract = {This new edition to the classic book by ggplot2 creator Hadley Wickham highlights compatibility with knitr and RStudio. ggplot2 is a data visualization package for R that helps users create data graphics, including those that are multi-layered, with ease. With ggplot2, it's easy to: produce handsome, publication-quality plots with automatic legends created from the plot specification; superimpose multiple layers (points, lines, maps, tiles, box plots) from different data sources with automatically adjusted common scales; add customizable smoothers that use powerful modeling capabilities of R, such as loess, linear models, generalized additive models, and robust regression; save any ggplot2 plot (or part thereof) for later modification or reuse; create custom themes that capture in-house or journal style requirements and that can easily be applied to multiple plots; approach a graph from a visual perspective, thinking about how each component of the data is represented on the final plot. This book will be useful to everyone who has struggled with displaying data in an informative and attractive way. Some basic knowledge of R is necessary (e.g., importing data into R). ggplot2 is a mini-language specifically tailored for producing graphics, and you'll learn everything you need in the book. After reading this book you'll be able to produce graphics customized precisely for your problems, and you'll find it easy to get graphics out of your head and on to the screen or page.},
  language = {en},
  publisher = {Springer},
  author = {Wickham, Hadley},
  month = jun,
  year = {2016},
  googlebooksid = {XgFkDAAAQBAJ},
  keywords = {Computers / Computer Graphics, Computers / Mathematical \& Statistical Software, Mathematics / Combinatorics, Mathematics / Graphic Methods, Mathematics / Probability \& Statistics / Stochastic Processes}
}

% Article: ggfortify, a unified ggplot2-style interface for plotting statistical results.
% The attachment path in `file` keeps its original spelling on purpose: it names a file on disk.
@article{tang_ggfortify:_2016,
	title = {{ggfortify}: {Unified} {Interface} to {Visualize} {Statistical} {Results} of {Popular} {R} {Packages}},
	volume = {8},
	abstract = {The ggfortify package provides a unified interface that enables users to use one line of code to visualize statistical results of many R packages using ggplot2 idioms. With the help of ggfortify, statisticians, data scientists, and researchers can avoid the sometimes repetitive work of using the ggplot2 syntax to achieve what they need.},
	language = {en},
	number = {2},
	journal = {The R Journal},
	doi = {10.32614/RJ-2016-060},
	author = {Tang, Yuan and Horikoshi, Masaaki and Li, Wenxuan},
	year = {2016},
	pages = {474--485},
	file = {Tang et al. - 2016 - ggfortify Uniﬁed Interface to Visualize Statistic.pdf:/Users/buyar/Documents/zotero_library/storage/3CWM7TRA/Tang et al. - 2016 - ggfortify Uniﬁed Interface to Visualize Statistic.pdf:application/pdf}
}

% Article: QuasR, an R/Bioconductor package for quantifying and annotating short reads.
@article{gaidatzis_quasr:_2015,
	author = {Gaidatzis, Dimos and Lerch, Anita and Hahne, Florian and Stadler, Michael B.},
	title = {{QuasR}: quantification and annotation of short reads in {R}},
	shorttitle = {{QuasR}},
	journal = {Bioinformatics},
	volume = {31},
	number = {7},
	pages = {1130--1132},
	month = apr,
	year = {2015},
	issn = {1367-4803},
	doi = {10.1093/bioinformatics/btu781},
	pmid = {25417205},
	pmcid = {PMC4382904},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4382904/},
	urldate = {2018-07-16},
	abstract = {Summary: QuasR is a package for the integrated analysis of high-throughput sequencing data in R, covering all steps from read preprocessing, alignment and quality control to quantification. QuasR supports different experiment types (including RNA-seq, ChIP-seq and Bis-seq) and analysis variants (e.g. paired-end, stranded, spliced and allele-specific), and is integrated in Bioconductor so that its output can be directly processed for statistical analysis and visualization., Availability and implementation: QuasR is implemented in R and C/C++. Source code and binaries for major platforms (Linux, OS X and MS Windows) are available from Bioconductor (www.bioconductor.org/packages/release/bioc/html/QuasR.html). The package includes a ‘vignette’ with step-by-step examples for typical work ﬂows., Contact:
michael.stadler@fmi.ch, Supplementary information:
Supplementary data are available at Bioinformatics online.},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/E67LGL5G/Gaidatzis et al. - 2015 - QuasR quantification and annotation of short read.pdf:application/pdf}
}

% Article: systemPipeR, an R/Bioconductor environment for NGS workflows and report generation.
% BMC Bioinformatics has no page ranges; `pages` carries the article number.
@article{backman_systempiper:_2016,
	title = {{systemPipeR}: {NGS} workflow and report generation environment},
	volume = {17},
	issn = {1471-2105},
	shorttitle = {{systemPipeR}},
	url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-1241-0},
	doi = {10.1186/s12859-016-1241-0},
	abstract = {Background: Next-generation sequencing (NGS) has revolutionized how research is carried out in many areas of biology and medicine. However, the analysis of NGS data remains a major obstacle to the efficient utilization of the technology, as it requires complex multi-step processing of big data demanding considerable computational expertise from users. While substantial effort has been invested on the development of software dedicated to the individual analysis steps of NGS experiments, insufficient resources are currently available for integrating the individual software components within the widely used R/Bioconductor environment into automated workflows capable of running the analysis of most types of NGS applications from start-to-finish in a time-efficient and reproducible manner.
Results: To address this need, we have developed the R/Bioconductor package systemPipeR. It is an extensible environment for both building and running end-to-end analysis workflows with automated report generation for a wide range of NGS applications. Its unique features include a uniform workflow interface across different NGS applications, automated report generation, and support for running both R and command-line software on local computers and computer clusters. A flexible sample annotation infrastructure efficiently handles complex sample sets and experimental designs. To simplify the analysis of widely used NGS applications, the package provides pre-configured workflows and reporting templates for RNA-Seq, ChIP-Seq, VAR-Seq and Ribo-Seq. Additional workflow templates will be provided in the future.
Conclusions: systemPipeR accelerates the extraction of reproducible analysis results from NGS experiments. By combining the capabilities of many R/Bioconductor and command-line tools, it makes efficient use of existing software resources without limiting the user to a set of predefined methods or environments. systemPipeR is freely available for all common operating systems from Bioconductor (http://bioconductor.org/packages/devel/systemPipeR).},
	language = {en},
	number = {1},
	urldate = {2018-07-16},
	journal = {BMC Bioinformatics},
	author = {Backman, Tyler W. H. and Girke, Thomas},
	month = dec,
	year = {2016},
	pages = {388},
	file = {Backman and Girke - 2016 - systemPipeR NGS workflow and report generation en.pdf:/Users/buyar/Documents/zotero_library/storage/VJAUQCX2/Backman and Girke - 2016 - systemPipeR NGS workflow and report generation en.pdf:application/pdf}
}

% Article: ShortRead, input and quality assessment of high-throughput sequence data.
@article{morgan_shortread:_2009,
	author = {Morgan, Martin and Anders, Simon and Lawrence, Michael and Aboyoun, Patrick and Pagès, Hervé and Gentleman, Robert},
	title = {{ShortRead}: a bioconductor package for input, quality assessment and exploration of high-throughput sequence data},
	shorttitle = {{ShortRead}},
	journal = {Bioinformatics},
	volume = {25},
	number = {19},
	pages = {2607--2608},
	month = oct,
	year = {2009},
	issn = {1367-4803},
	doi = {10.1093/bioinformatics/btp450},
	url = {https://academic.oup.com/bioinformatics/article/25/19/2607/180881},
	urldate = {2018-07-16},
	language = {en},
	abstract = {Abstract.  Summary:ShortRead is a package for input, quality assessment, manipulation and output of high-throughput sequencing data. ShortRead is provided in th},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/G9F6NKC5/Morgan et al. - 2009 - ShortRead a bioconductor package for input, quali.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/PWT57IID/180881.html:text/html}
}

% Article: the Subread aligner (seed-and-vote read mapping).
% e108 is an electronic article number, so `pages` holds the single identifier rather than a range.
@article{liao_subread_2013,
	title = {The {Subread} aligner: fast, accurate and scalable read mapping by seed-and-vote},
	volume = {41},
	issn = {0305-1048},
	shorttitle = {The {Subread} aligner},
	url = {https://academic.oup.com/nar/article/41/10/e108/1075719},
	doi = {10.1093/nar/gkt214},
	abstract = {Abstract.  Read alignment is an ongoing challenge for the analysis of data from sequencing technologies. This article proposes an elegantly simple multi-seed st},
	language = {en},
	number = {10},
	urldate = {2018-07-16},
	journal = {Nucleic Acids Research},
	author = {Liao, Yang and Smyth, Gordon K. and Shi, Wei},
	month = may,
	year = {2013},
	pages = {e108},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/D3PH3PZV/Liao et al. - 2013 - The Subread aligner fast, accurate and scalable r.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/CRBB7CAU/1075719.html:text/html}
}

% Article: Bioconductor infrastructure for genomic ranges (IRanges, GenomicRanges, GenomicFeatures).
@article{lawrence_software_2013,
	author = {Lawrence, Michael and Huber, Wolfgang and Pagès, Hervé and Aboyoun, Patrick and Carlson, Marc and Gentleman, Robert and Morgan, Martin T. and Carey, Vincent J.},
	title = {Software for {Computing} and {Annotating} {Genomic} {Ranges}},
	journal = {PLOS Computational Biology},
	volume = {9},
	number = {8},
	pages = {e1003118},
	month = aug,
	year = {2013},
	issn = {1553-7358},
	doi = {10.1371/journal.pcbi.1003118},
	url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003118},
	urldate = {2018-07-16},
	language = {en},
	keywords = {Genome analysis, Genome annotation, Genome complexity, Genomic databases, Mammalian genomics, Sequence alignment, Software tools, Structural genomics},
	abstract = {We describe Bioconductor infrastructure for representing and computing on annotated genomic ranges and integrating genomic data with the statistical computing features of R and its extensions. At the core of the infrastructure are three packages: IRanges, GenomicRanges, and GenomicFeatures. These packages provide scalable data structures for representing annotated ranges on the genome, with special support for transcript structures, read alignments and coverage vectors. Computational facilities include efficient algorithms for overlap and nearest neighbor detection, coverage calculation and other range operations. This infrastructure directly supports more than 80 other Bioconductor packages, including those for sequence analysis, differential expression analysis and visualization.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/IDM9ABDC/Lawrence et al. - 2013 - Software for Computing and Annotating Genomic Rang.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/CX4YYHZX/article.html:text/html}
}

% Article: edgeR, differential expression analysis of replicated count data.
% Journal name uses the plain "Bioinformatics" spelling found in the file's other entries for this journal.
@article{robinson_edger:_2010,
	title = {{edgeR}: a {Bioconductor} package for differential expression analysis of digital gene expression data},
	volume = {26},
	issn = {1367-4811},
	shorttitle = {{edgeR}},
	doi = {10.1093/bioinformatics/btp616},
	abstract = {SUMMARY: It is expected that emerging digital gene expression (DGE) technologies will overtake microarray technologies in the near future for many functional genomics applications. One of the fundamental data analysis tasks, especially for gene expression studies, involves determining whether there is evidence that counts for a transcript or exon are significantly different across experimental conditions. edgeR is a Bioconductor software package for examining differential expression of replicated count data. An overdispersed Poisson model is used to account for both biological and technical variability. Empirical Bayes methods are used to moderate the degree of overdispersion across transcripts, improving the reliability of inference. The methodology can be used even with the most minimal levels of replication, provided at least one phenotype or experimental condition is replicated. The software may have other applications beyond sequencing data, such as proteome peptide count data.
AVAILABILITY: The package is freely available under the LGPL licence from the Bioconductor web site (http://bioconductor.org).},
	language = {eng},
	number = {1},
	journal = {Bioinformatics},
	author = {Robinson, Mark D. and McCarthy, Davis J. and Smyth, Gordon K.},
	month = jan,
	year = {2010},
	pmid = {19910308},
	pmcid = {PMC2796818},
	keywords = {Algorithms, Gene Expression Profiling, Oligonucleotide Array Sequence Analysis, Programming Languages, Signal Processing, Computer-Assisted, Software},
	pages = {139--140}
}

% Article: the ComplexHeatmap package for multi-panel annotated heatmaps.
% Journal name uses the plain "Bioinformatics" spelling found in the file's other entries for this journal.
@article{gu_complex_2016,
	title = {Complex heatmaps reveal patterns and correlations in multidimensional genomic data},
	volume = {32},
	issn = {1367-4811},
	doi = {10.1093/bioinformatics/btw313},
	abstract = {Parallel heatmaps with carefully designed annotation graphics are powerful for efficient visualization of patterns and relationships among high dimensional genomic data. Here we present the ComplexHeatmap package that provides rich functionalities for customizing heatmaps, arranging multiple parallel heatmaps and including user-defined annotation graphics. We demonstrate the power of ComplexHeatmap to easily reveal patterns and correlations among multiple sources of information with four real-world datasets.
AVAILABILITY AND IMPLEMENTATION: The ComplexHeatmap package and documentation are freely available from the Bioconductor project: http://www.bioconductor.org/packages/devel/bioc/html/ComplexHeatmap.html
CONTACT: m.schlesner@dkfz.de
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
	language = {eng},
	number = {18},
	journal = {Bioinformatics},
	author = {Gu, Zuguang and Eils, Roland and Schlesner, Matthias},
	year = {2016},
	pmid = {27207943},
	keywords = {Computer Graphics, Gene Expression, Genomics, Humans, Metabolic Networks and Pathways, Software},
	pages = {2847--2849}
}

% Article: relative log expression (RLE) plots for spotting unwanted variation.
% Journal name follows the "PLOS" capitalisation used by the file's other PLOS entry.
@article{gandolfo_rle_2018,
	title = {{RLE} plots: {Visualizing} unwanted variation in high dimensional data},
	volume = {13},
	issn = {1932-6203},
	shorttitle = {{RLE} plots},
	doi = {10.1371/journal.pone.0191629},
	abstract = {Unwanted variation can be highly problematic and so its detection is often crucial. Relative log expression (RLE) plots are a powerful tool for visualizing such variation in high dimensional data. We provide a detailed examination of these plots, with the aid of examples and simulation, explaining what they are and what they can reveal. RLE plots are particularly useful for assessing whether a procedure aimed at removing unwanted variation, i.e. a normalization procedure, has been successful. These plots, while originally devised for gene expression data from microarrays, can also be used to reveal unwanted variation in many other kinds of high dimensional data, where such variation can be problematic.},
	language = {eng},
	number = {2},
	journal = {PLOS ONE},
	author = {Gandolfo, Luke C. and Speed, Terence P.},
	year = {2018},
	pmid = {29401521},
	pmcid = {PMC5798764},
	keywords = {Computer Simulation, Gene Expression},
	pages = {e0191629}
}

% Article: within-lane GC-content normalization for RNA-Seq read counts (EDASeq package).
% Journal capitalisation and issue number match the file's other BMC Bioinformatics entry.
@article{risso_gc-content_2011,
	title = {{GC}-content normalization for {RNA}-{Seq} data},
	volume = {12},
	issn = {1471-2105},
	doi = {10.1186/1471-2105-12-480},
	abstract = {BACKGROUND: Transcriptome sequencing (RNA-Seq) has become the assay of choice for high-throughput studies of gene expression. However, as is the case with microarrays, major technology-related artifacts and biases affect the resulting expression measures. Normalization is therefore essential to ensure accurate inference of expression levels and subsequent analyses thereof.
RESULTS: We focus on biases related to GC-content and demonstrate the existence of strong sample-specific GC-content effects on RNA-Seq read counts, which can substantially bias differential expression analysis. We propose three simple within-lane gene-level GC-content normalization approaches and assess their performance on two different RNA-Seq datasets, involving different species and experimental designs. Our methods are compared to state-of-the-art normalization procedures in terms of bias and mean squared error for expression fold-change estimation and in terms of Type I error and p-value distributions for tests of differential expression. The exploratory data analysis and normalization methods proposed in this article are implemented in the open-source Bioconductor R package EDASeq.
CONCLUSIONS: Our within-lane normalization procedures, followed by between-lane normalization, reduce GC-content bias and lead to more accurate estimates of expression fold-changes and tests of differential expression. Such results are crucial for the biological interpretation of RNA-Seq experiments, where downstream analyses can be sensitive to the supplied lists of genes.},
	language = {eng},
	number = {1},
	journal = {BMC Bioinformatics},
	author = {Risso, Davide and Schwartz, Katja and Sherlock, Gavin and Dudoit, Sandrine},
	month = dec,
	year = {2011},
	pmid = {22177264},
	pmcid = {PMC3315510},
	keywords = {Base Composition, Gene Expression Profiling, Saccharomyces cerevisiae, Sequence Analysis, RNA, Transcriptome},
	pages = {480}
}

% Web page: FastQC, a quality-control tool for high-throughput sequence data.
% The site name moved from the scraped page title into `howpublished`; the citation key is unchanged.
% NOTE(review): author taken from the tool's customary citation - confirm against the project page.
@misc{noauthor_babraham_nodate,
	author = {Andrews, Simon},
	title = {{FastQC}: {A} {Quality} {Control} {Tool} for {High} {Throughput} {Sequence} {Data}},
	howpublished = {Babraham Bioinformatics},
	url = {https://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
	urldate = {2018-07-16},
	file = {Babraham Bioinformatics - FastQC A Quality Control tool for High Throughput Sequence Data:/Users/buyar/Documents/zotero_library/storage/7J7GBCVL/fastqc.html:text/html}
}

% Article: Trimmomatic, a read trimmer for Illumina sequence data.
@article{bolger_trimmomatic:_2014,
	author = {Bolger, Anthony M. and Lohse, Marc and Usadel, Bjoern},
	title = {Trimmomatic: a flexible trimmer for {Illumina} sequence data},
	shorttitle = {Trimmomatic},
	journal = {Bioinformatics},
	volume = {30},
	number = {15},
	pages = {2114--2120},
	month = aug,
	year = {2014},
	issn = {1367-4803},
	doi = {10.1093/bioinformatics/btu170},
	pmid = {24695404},
	pmcid = {PMC4103590},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/},
	urldate = {2018-07-16},
	abstract = {Motivation: Although many next-generation sequencing (NGS) read preprocessing tools already existed, we could not find any tool or combination of tools that met our requirements in terms of flexibility, correct handling of paired-end data and high performance. We have developed Trimmomatic as a more flexible and efficient preprocessing tool, which could correctly handle paired-end data., Results: The value of NGS read preprocessing is demonstrated for both reference-based and reference-free tasks. Trimmomatic is shown to produce output that is at least competitive with, and in many cases superior to, that produced by other tools, in all scenarios tested., Availability and implementation: Trimmomatic is licensed under GPL V3. It is cross-platform (Java 1.5+ required) and available at http://www.usadellab.org/cms/index.php?page=trimmomatic, Contact:
usadel@bio1.rwth-aachen.de, Supplementary information:
Supplementary data are available at Bioinformatics online.},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/6AQDDN4Z/Bolger et al. - 2014 - Trimmomatic a flexible trimmer for Illumina seque.pdf:application/pdf}
}

% Web page: Trim Galore!, hosted by Babraham Bioinformatics.
% The site name moved from the scraped page title into `howpublished`; the citation key is unchanged.
% NOTE(review): author taken from the tool's customary citation - confirm against the project page.
@misc{noauthor_babraham_nodate-1,
	author = {Krueger, Felix},
	title = {{Trim} {Galore}!},
	howpublished = {Babraham Bioinformatics},
	url = {https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/},
	urldate = {2018-07-16},
	file = {Babraham Bioinformatics - Trim Galore!:/Users/buyar/Documents/zotero_library/storage/HVB3FZXH/trim_galore.html:text/html}
}

% Article: STAR, the Spliced Transcripts Alignment to a Reference RNA-seq aligner.
@article{dobin_star:_2013,
	author = {Dobin, Alexander and Davis, Carrie A. and Schlesinger, Felix and Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut, Philippe and Chaisson, Mark and Gingeras, Thomas R.},
	title = {{STAR}: ultrafast universal {RNA}-seq aligner},
	shorttitle = {{STAR}},
	journal = {Bioinformatics},
	volume = {29},
	number = {1},
	pages = {15--21},
	month = jan,
	year = {2013},
	issn = {1367-4803},
	doi = {10.1093/bioinformatics/bts635},
	pmid = {23104886},
	pmcid = {PMC3530905},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/},
	urldate = {2018-07-16},
	abstract = {Motivation: Accurate alignment of high-throughput RNA-seq data is a challenging and yet unsolved problem because of the non-contiguous transcript structure, relatively short read lengths and constantly increasing throughput of the sequencing technologies. Currently available RNA-seq aligners suffer from high mapping error rates, low mapping speed, read length limitation and mapping biases., Results: To align our large ({\textgreater}80 billon reads) ENCODE Transcriptome RNA-seq dataset, we developed the Spliced Transcripts Alignment to a Reference (STAR) software based on a previously undescribed RNA-seq alignment algorithm that uses sequential maximum mappable seed search in uncompressed suffix arrays followed by seed clustering and stitching procedure. STAR outperforms other aligners by a factor of {\textgreater}50 in mapping speed, aligning to the human genome 550 million 2 × 76 bp paired-end reads per hour on a modest 12-core server, while at the same time improving alignment sensitivity and precision. In addition to unbiased de novo detection of canonical junctions, STAR can discover non-canonical splices and chimeric (fusion) transcripts, and is also capable of mapping full-length RNA sequences. Using Roche 454 sequencing of reverse transcription polymerase chain reaction amplicons, we experimentally validated 1960 novel intergenic splice junctions with an 80–90\% success rate, corroborating the high precision of the STAR mapping strategy., Availability and implementation: STAR is implemented as a standalone C++ code. STAR is free open source software distributed under GPLv3 license and can be downloaded from http://code.google.com/p/rna-star/., Contact:
dobin@cshl.edu.},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/IVLBMHIW/Dobin et al. - 2013 - STAR ultrafast universal RNA-seq aligner.pdf:application/pdf}
}

% Article: TopHat2, a spliced aligner that handles indels and gene fusions.
% The tool name is case-protected as one whole word so the brace group does not split it.
@article{kim_tophat2:_2013,
	title = {{TopHat2}: accurate alignment of transcriptomes in the presence of insertions, deletions and gene fusions},
	volume = {14},
	issn = {1474-760X},
	shorttitle = {{TopHat2}},
	doi = {10.1186/gb-2013-14-4-r36},
	abstract = {TopHat is a popular spliced aligner for RNA-sequence (RNA-seq) experiments. In this paper, we describe TopHat2, which incorporates many significant enhancements to TopHat. TopHat2 can align reads of various lengths produced by the latest sequencing technologies, while allowing for variable-length indels with respect to the reference genome. In addition to de novo spliced alignment, TopHat2 can align reads across fusion breaks, which can occur after genomic translocations. TopHat2 combines the ability to identify novel splice sites with direct mapping to known transcripts, producing sensitive and accurate alignments, even for highly repetitive genomes or in the presence of pseudogenes. TopHat2 is available at http://ccb.jhu.edu/software/tophat.},
	language = {eng},
	number = {4},
	journal = {Genome Biology},
	author = {Kim, Daehwan and Pertea, Geo and Trapnell, Cole and Pimentel, Harold and Kelley, Ryan and Salzberg, Steven L.},
	month = apr,
	year = {2013},
	pmid = {23618408},
	pmcid = {PMC4053844},
	keywords = {Gene Duplication, Gene Fusion, Humans, Mutagenesis, Insertional, Sensitivity and Specificity, Sequence Alignment, Sequence Analysis, RNA, Software, Transcriptome},
	pages = {R36}
}

% Article: HISAT, a spliced aligner with low memory requirements.
% The author's middle initial carries its period, as in the file's other entry for this author.
@article{kim_hisat:_2015,
	title = {{HISAT}: a fast spliced aligner with low memory requirements},
	volume = {12},
	issn = {1548-7091, 1548-7105},
	shorttitle = {{HISAT}},
	url = {http://www.nature.com/articles/nmeth.3317},
	doi = {10.1038/nmeth.3317},
	language = {en},
	number = {4},
	urldate = {2018-07-16},
	journal = {Nature Methods},
	author = {Kim, Daehwan and Langmead, Ben and Salzberg, Steven L.},
	month = apr,
	year = {2015},
	pages = {357--360},
	file = {Kim et al. - 2015 - HISAT a fast spliced aligner with low memory requ.pdf:/Users/buyar/Documents/zotero_library/storage/EPTDHZG2/Kim et al. - 2015 - HISAT a fast spliced aligner with low memory requ.pdf:application/pdf}
}

% Article: GMAP and GSNAP genomic sequence aligners, enhancements to speed, accuracy and functionality.
@article{wu_gmap_2016,
	author = {Wu, Thomas D. and Reeder, Jens and Lawrence, Michael and Becker, Gabe and Brauer, Matthew J.},
	title = {{GMAP} and {GSNAP} for {Genomic} {Sequence} {Alignment}: {Enhancements} to {Speed}, {Accuracy}, and {Functionality}},
	shorttitle = {{GMAP} and {GSNAP} for {Genomic} {Sequence} {Alignment}},
	journal = {Methods in Molecular Biology (Clifton, N.J.)},
	volume = {1418},
	pages = {283--334},
	year = {2016},
	issn = {1940-6029},
	doi = {10.1007/978-1-4939-3578-9_15},
	pmid = {27008021},
	language = {eng},
	keywords = {Algorithms, Bioinformatics algorithms, Computational Biology, Databases, Genetic, DNA-seq, Gene Expression Profiling, Genome, Genomic alignment, Genomic mapping, Genomics, High-Throughput Nucleotide Sequencing, Next-generation sequencing, Polymorphism, Single Nucleotide, Reproducibility of Results, RNA Editing, RNA Splicing, RNA-seq, Sequence Alignment, Sequence analysis, Sequence Analysis, DNA, Software, Time Factors, Transcriptome analysis},
	abstract = {The programs GMAP and GSNAP, for aligning RNA-Seq and DNA-Seq datasets to genomes, have evolved along with advances in biological methodology to handle longer reads, larger volumes of data, and new types of biological assays. The genomic representation has been improved to include linear genomes that can compare sequences using single-instruction multiple-data (SIMD) instructions, compressed genomic hash tables with fast access using SIMD instructions, handling of large genomes with more than four billion bp, and enhanced suffix arrays (ESAs) with novel data structures for fast access. Improvements to the algorithms have included a greedy match-and-extend algorithm using suffix arrays, segment chaining using genomic hash tables, diagonalization using segmental hash tables, and nucleotide-level dynamic programming procedures that use SIMD instructions and eliminate the need for F-loop calculations. Enhancements to the functionality of the programs include standardization of indel positions, handling of ambiguous splicing, clipping and merging of overlapping paired-end reads, and alignments to circular chromosomes and alternate scaffolds. The programs have been adapted for use in pipelines by integrating their usage into R/Bioconductor packages such as gmapR and HTSeqGenie, and these pipelines have facilitated the discovery of numerous biological phenomena.}
}

% Article: Salmon, bias-aware transcript quantification from RNA-seq reads.
% Journal capitalisation matches the file's other Nature Methods entries; author initials carry periods.
@article{patro_salmon:_2017,
	title = {Salmon: fast and bias-aware quantification of transcript expression using dual-phase inference},
	volume = {14},
	issn = {1548-7091},
	shorttitle = {Salmon},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5600148/},
	doi = {10.1038/nmeth.4197},
	abstract = {We introduce Salmon, a method for quantifying transcript abundance from RNA-seq reads that is accurate and fast. Salmon is the first transcriptome-wide quantifier to correct for fragment GC content bias, which we demonstrate substantially improves the accuracy of abundance estimates and the reliability of subsequent differential expression analysis. Salmon combines a new dual-phase parallel inference algorithm and feature-rich bias models with an ultra-fast read mapping procedure.},
	number = {4},
	urldate = {2018-07-16},
	journal = {Nature Methods},
	author = {Patro, Rob and Duggal, Geet and Love, Michael I. and Irizarry, Rafael A. and Kingsford, Carl},
	month = apr,
	year = {2017},
	pmid = {28263959},
	pmcid = {PMC5600148},
	pages = {417--419},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/JNYLGN5H/Patro et al. - 2017 - Salmon fast and bias-aware quantification of tran.pdf:application/pdf}
}

% Article: kallisto, RNA-seq quantification by pseudoalignment.
@article{bray_near-optimal_2016,
	author = {Bray, Nicolas L. and Pimentel, Harold and Melsted, Páll and Pachter, Lior},
	title = {Near-optimal probabilistic {RNA}-seq quantification},
	journal = {Nature Biotechnology},
	volume = {34},
	number = {5},
	pages = {525--527},
	month = may,
	year = {2016},
	issn = {1546-1696},
	doi = {10.1038/nbt.3519},
	url = {https://www.nature.com/articles/nbt.3519},
	urldate = {2018-07-16},
	language = {en},
	copyright = {2016 Nature Publishing Group},
	abstract = {We present kallisto, an RNA-seq quantification program that is two orders of magnitude faster than previous approaches and achieves similar accuracy. Kallisto pseudoaligns reads to a reference, producing a list of transcripts that are compatible with each read while avoiding alignment of individual bases. We use kallisto to analyze 30 million unaligned paired-end RNA-seq reads in {\textless}10 min on a standard laptop computer. This removes a major computational bottleneck in RNA-seq analysis.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/K3H3EIPL/Bray et al. - 2016 - Near-optimal probabilistic RNA-seq quantification.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/LQ4Z54PT/nbt.html:text/html}
}

% Article: Sailfish, alignment-free isoform quantification from RNA-seq reads.
@article{patro_sailfish_2014,
	author = {Patro, Rob and Mount, Stephen M. and Kingsford, Carl},
	title = {Sailfish enables alignment-free isoform quantification from {RNA}-seq reads using lightweight algorithms},
	journal = {Nature Biotechnology},
	volume = {32},
	number = {5},
	pages = {462--464},
	month = may,
	year = {2014},
	issn = {1546-1696},
	doi = {10.1038/nbt.2862},
	url = {https://www.nature.com/articles/nbt.2862},
	urldate = {2018-07-16},
	language = {en},
	copyright = {2014 Nature Publishing Group},
	abstract = {We introduce Sailfish, a computational method for quantifying the abundance of previously annotated RNA isoforms from RNA-seq data. Because Sailfish entirely avoids mapping reads, a time-consuming step in all current methods, it provides quantification estimates much faster than do existing approaches (typically 20 times faster) without loss of accuracy. By facilitating frequent reanalysis of data and reducing the need to optimize parameters, Sailfish exemplifies the potential of lightweight algorithms for efficiently processing sequencing reads.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/UCA63GMJ/Patro et al. - 2014 - Sailfish enables alignment-free isoform quantifica.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/7D76TGMM/nbt.html:text/html}
}

% Article: Trinity protocol for de novo transcriptome assembly from RNA-Seq data.
% Page range added and journal name capitalised.
@article{haas_novo_2013,
	title = {De novo transcript sequence reconstruction from {RNA}-{Seq}: reference generation and analysis with {Trinity}},
	volume = {8},
	issn = {1754-2189},
	shorttitle = {De novo transcript sequence reconstruction from {RNA}-{Seq}},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3875132/},
	doi = {10.1038/nprot.2013.084},
	abstract = {De novo assembly of RNA-Seq data allows us to study transcriptomes without the need for a genome sequence, such as in non-model organisms of ecological and evolutionary importance, cancer samples, or the microbiome. In this protocol, we describe the use of the Trinity platform for de novo transcriptome assembly from RNA-Seq data in non-model organisms. We also present Trinity’s supported companion utilities for downstream applications, including RSEM for transcript abundance estimation, R/Bioconductor packages for identifying differentially expressed transcripts across samples, and approaches to identify protein coding genes. In an included tutorial we provide a workflow for genome-independent transcriptome analysis leveraging the Trinity platform. The software, documentation and demonstrations are freely available from http://trinityrnaseq.sf.net.},
	number = {8},
	urldate = {2018-07-16},
	journal = {Nature Protocols},
	author = {Haas, Brian J. and Papanicolaou, Alexie and Yassour, Moran and Grabherr, Manfred and Blood, Philip D. and Bowden, Joshua and Couger, Matthew Brian and Eccles, David and Li, Bo and Lieber, Matthias and MacManes, Matthew D. and Ott, Michael and Orvis, Joshua and Pochet, Nathalie and Strozzi, Francesco and Weeks, Nathan and Westerman, Rick and William, Thomas and Dewey, Colin N. and Henschel, Robert and LeDuc, Richard D. and Friedman, Nir and Regev, Aviv},
	month = aug,
	year = {2013},
	pmid = {23845962},
	pmcid = {PMC3875132},
	pages = {1494--1512},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/TVJY4NY5/Haas et al. - 2013 - De novo transcript sequence reconstruction from RN.pdf:application/pdf}
}

% Article: Trans-ABySS, a de novo short-read transcriptome assembly and analysis pipeline.
@article{robertson_novo_2010,
	author = {Robertson, Gordon and Schein, Jacqueline and Chiu, Readman and Corbett, Richard and Field, Matthew and Jackman, Shaun D. and Mungall, Karen and Lee, Sam and Okada, Hisanaga Mark and Qian, Jenny Q. and Griffith, Malachi and Raymond, Anthony and Thiessen, Nina and Cezard, Timothee and Butterfield, Yaron S. and Newsome, Richard and Chan, Simon K. and She, Rong and Varhol, Richard and Kamoh, Baljit and Prabhu, Anna-Liisa and Tam, Angela and Zhao, YongJun and Moore, Richard A. and Hirst, Martin and Marra, Marco A. and Jones, Steven J. M. and Hoodless, Pamela A. and Birol, Inanc},
	title = {\textit{{De} novo} assembly and analysis of {RNA}-seq data},
	journal = {Nature Methods},
	volume = {7},
	number = {11},
	pages = {909--912},
	month = nov,
	year = {2010},
	issn = {1548-7105},
	doi = {10.1038/nmeth.1517},
	url = {https://www.nature.com/articles/nmeth.1517},
	urldate = {2018-07-16},
	language = {en},
	copyright = {2010 Nature Publishing Group},
	abstract = {We describe Trans-ABySS, a de novo short-read transcriptome assembly and analysis pipeline that addresses variation in local read densities by assembling read substrings with varying stringencies and then merging the resulting contigs before analysis. Analyzing 7.4 gigabases of 50-base-pair paired-end Illumina reads from an adult mouse liver poly(A) RNA library, we identified known, new and alternative structures in expressed transcripts, and achieved high sensitivity and specificity relative to reference-based assembly methods.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/RPL8YT7T/Robertson et al. - 2010 - iDe novoi assembly and analysis of RNA-seq da.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/U9FA9IWI/nmeth.html:text/html}
}

@article{leek_sva_2012,
	title = {The sva package for removing batch effects and other unwanted variation in high-throughput experiments},
	volume = {28},
	issn = {1367-4803},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3307112/},
	doi = {10.1093/bioinformatics/bts034},
	abstract = {Summary: Heterogeneity and latent variables are now widely recognized as major sources of bias and variability in high-throughput experiments. The most well-known source of latent variation in genomic experiments are batch effects—when samples are processed on different days, in different groups or by different people. However, there are also a large number of other variables that may have a major impact on high-throughput measurements. Here we describe the sva package for identifying, estimating and removing unwanted sources of variation in high-throughput experiments. The sva package supports surrogate variable estimation with the sva function, direct adjustment for known batch effects with the ComBat function and adjustment for batch and latent variables in prediction problems with the fsva function. Availability: The R package sva is freely available from http://www.bioconductor.org. Contact: jleek@jhsph.edu. Supplementary information: Supplementary data are available at Bioinformatics online.},
	number = {6},
	urldate = {2018-07-16},
	journal = {Bioinformatics},
	author = {Leek, Jeffrey T. and Johnson, W. Evan and Parker, Hilary S. and Jaffe, Andrew E. and Storey, John D.},
	month = mar,
	year = {2012},
	pmid = {22257669},
	pmcid = {PMC3307112},
	pages = {882--883},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/A66LXLHH/Leek et al. - 2012 - The sva package for removing batch effects and oth.pdf:application/pdf}
}

@article{jiang_synthetic_2011,
	title = {Synthetic spike-in standards for {RNA}-seq experiments},
	volume = {21},
	issn = {1088-9051},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3166838/},
	doi = {10.1101/gr.121095.111},
	abstract = {High-throughput sequencing of cDNA (RNA-seq) is a widely deployed transcriptome profiling and annotation technique, but questions about the performance of different protocols and platforms remain. We used a newly developed pool of 96 synthetic RNAs with various lengths, and GC content covering a $2^{20}$ concentration range as spike-in controls to measure sensitivity, accuracy, and biases in RNA-seq experiments as well as to derive standard curves for quantifying the abundance of transcripts. We observed linearity between read density and RNA input over the entire detection range and excellent agreement between replicates, but we observed significantly larger imprecision than expected under pure Poisson sampling errors. We use the control RNAs to directly measure reproducible protocol-dependent biases due to GC content and transcript length as well as stereotypic heterogeneity in coverage across transcripts correlated with position relative to RNA termini and priming sequence bias. These effects lead to biased quantification for short transcripts and individual exons, which is a serious problem for measurements of isoform abundances, but that can partially be corrected using appropriate models of bias. By using the control RNAs, we derive limits for the discovery and detection of rare transcripts in RNA-seq experiments. By using data collected as part of the model organism and human Encyclopedia of DNA Elements projects (ENCODE and modENCODE), we demonstrate that external RNA controls are a useful resource for evaluating sensitivity and accuracy of RNA-seq experiments for transcriptome discovery and quantification. These quality metrics facilitate comparable analysis across different samples, protocols, and platforms.},
	number = {9},
	urldate = {2018-07-16},
	journal = {Genome Research},
	author = {Jiang, Lichun and Schlesinger, Felix and Davis, Carrie A. and Zhang, Yu and Li, Renhua and Salit, Marc and Gingeras, Thomas R. and Oliver, Brian},
	month = sep,
	year = {2011},
	pmid = {21816910},
	pmcid = {PMC3166838},
	pages = {1543--1551},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/ZUS294V2/Jiang et al. - 2011 - Synthetic spike-in standards for RNA-seq experimen.pdf:application/pdf}
}

@article{subramanian_gene_2005,
	author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.},
	title = {Gene set enrichment analysis: {A} knowledge-based approach for interpreting genome-wide expression profiles},
	shorttitle = {Gene set enrichment analysis},
	journal = {Proceedings of the National Academy of Sciences},
	volume = {102},
	number = {43},
	pages = {15545--15550},
	month = oct,
	year = {2005},
	issn = {0027-8424, 1091-6490},
	doi = {10.1073/pnas.0506580102},
	url = {http://www.pnas.org/content/102/43/15545},
	urldate = {2018-07-16},
	pmid = {16199517},
	language = {en},
	keywords = {microarray},
	copyright = {Copyright © 2005, The National Academy of Sciences.  Freely available online through the PNAS open access option.},
	abstract = {Although genomewide RNA expression analysis has become a routine tool in biomedical research, extracting biological insight from such information remains a major challenge. Here, we describe a powerful analytical method called Gene Set Enrichment Analysis (GSEA) for interpreting gene expression data. The method derives its power by focusing on gene sets, that is, groups of genes that share common biological function, chromosomal location, or regulation. We demonstrate how GSEA yields insights into several cancer-related data sets, including leukemia and lung cancer. Notably, where single-gene analysis finds little similarity between two independent studies of patient survival in lung cancer, GSEA reveals many biological pathways in common. The GSEA method is embodied in a freely available software package, together with an initial database of 1,325 biologically defined gene sets.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/JJRZ6NL2/Subramanian et al. - 2005 - Gene set enrichment analysis A knowledge-based ap.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/SRUUGMZ9/15545.html:text/html}
}

@article{kanehisa_kegg_2016,
	title = {{KEGG} as a reference resource for gene and protein annotation},
	volume = {44},
	issn = {0305-1048},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4702792/},
	doi = {10.1093/nar/gkv1070},
	abstract = {KEGG (http://www.kegg.jp/ or http://www.genome.jp/kegg/) is an integrated database resource for biological interpretation of genome sequences and other high-throughput data. Molecular functions of genes and proteins are associated with ortholog groups and stored in the KEGG Orthology (KO) database. The KEGG pathway maps, BRITE hierarchies and KEGG modules are developed as networks of KO nodes, representing high-level functions of the cell and the organism. Currently, more than 4000 complete genomes are annotated with KOs in the KEGG GENES database, which can be used as a reference data set for KO assignment and subsequent reconstruction of KEGG pathways and other molecular networks. As an annotation resource, the following improvements have been made. First, each KO record is re-examined and associated with protein sequence data used in experiments of functional characterization. Second, the GENES database now includes viruses, plasmids, and the addendum category for functionally characterized proteins that are not represented in complete genomes. Third, new automatic annotation servers, BlastKOALA and GhostKOALA, are made available utilizing the non-redundant pangenome data set generated from the GENES database. As a resource for translational bioinformatics, various data sets are created for antimicrobial resistance and drug interaction networks.},
	number = {D1},
	urldate = {2018-07-16},
	journal = {Nucleic Acids Research},
	author = {Kanehisa, Minoru and Sato, Yoko and Kawashima, Masayuki and Furumichi, Miho and Tanabe, Mao},
	month = jan,
	year = {2016},
	pmid = {26476454},
	pmcid = {PMC4702792},
	pages = {D457--D462},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/7YIHMZHL/Kanehisa et al. - 2016 - KEGG as a reference resource for gene and protein .pdf:application/pdf}
}

% NOTE: the entry keyed reactome_latent_paper later in this file describes the same article;
% both keys are kept because either may be cited.
@article{fabregat_reactome_2018,
	author = {Fabregat, Antonio and Jupe, Steven and Matthews, Lisa and Sidiropoulos, Konstantinos and Gillespie, Marc and Garapati, Phani and Haw, Robin and Jassal, Bijay and Korninger, Florian and May, Bruce and Milacic, Marija and Roca, Corina Duenas and Rothfels, Karen and Sevilla, Cristoffer and Shamovsky, Veronica and Shorser, Solomon and Varusai, Thawfeek and Viteri, Guilherme and Weiser, Joel and Wu, Guanming and Stein, Lincoln and Hermjakob, Henning and D'Eustachio, Peter},
	title = {The {Reactome} {Pathway} {Knowledgebase}},
	journal = {Nucleic Acids Research},
	volume = {46},
	number = {D1},
	pages = {D649--D655},
	month = jan,
	year = {2018},
	issn = {1362-4962},
	doi = {10.1093/nar/gkx1132},
	pmid = {29145629},
	pmcid = {PMC5753187},
	language = {eng},
	abstract = {The Reactome Knowledgebase (https://reactome.org) provides molecular details of signal transduction, transport, DNA replication, metabolism, and other cellular processes as an ordered network of molecular transformations-an extended version of a classic metabolic map, in a single consistent data model. Reactome functions both as an archive of biological processes and as a tool for discovering unexpected functional relationships in data such as gene expression profiles or somatic mutation catalogues from tumor cells. To support the continued brisk growth in the size and complexity of Reactome, we have implemented a graph database, improved performance of data analysis tools, and designed new data structures and strategies to boost diagram viewer performance. To make our website more accessible to human users, we have improved pathway display and navigation by implementing interactive Enhanced High Level Diagrams (EHLDs) with an associated icon library, and subpathway highlighting and zooming, in a simplified and reorganized web site with adaptive design. To encourage re-use of our content, we have enabled export of pathway diagrams as 'PowerPoint' files.}
}

@article{luo_gage:_2009,
	title = {{GAGE}: generally applicable gene set enrichment for pathway analysis},
	volume = {10},
	issn = {1471-2105},
	shorttitle = {{GAGE}},
	url = {http://www.biomedcentral.com/1471-2105/10/161},
	doi = {10.1186/1471-2105-10-161},
	abstract = {Background: Gene set analysis (GSA) is a widely used strategy for gene expression data analysis based on pathway knowledge. GSA focuses on sets of related genes and has established major advantages over individual gene analyses, including greater robustness, sensitivity and biological relevance. However, previous GSA methods have limited usage as they cannot handle datasets of different sample sizes or experimental designs.
Results: To address these limitations, we present a new GSA method called Generally Applicable Gene-set Enrichment (GAGE). We successfully apply GAGE to multiple microarray datasets with different sample sizes, experimental designs and profiling techniques. GAGE shows significantly better results when compared to two other commonly used GSA methods of GSEA and PAGE. We demonstrate this improvement in the following three aspects: (1) consistency across repeated studies/experiments; (2) sensitivity and specificity; (3) biological relevance of the regulatory mechanisms inferred. GAGE reveals novel and relevant regulatory mechanisms from both published and previously unpublished microarray studies. From two published lung cancer data sets, GAGE derived a more cohesive and predictive mechanistic scheme underlying lung cancer progress and metastasis. For a previously unpublished BMP6 study, GAGE predicted novel regulatory mechanisms for BMP6 induced osteoblast differentiation, including the canonical BMP-TGF beta signaling, JAK-STAT signaling, Wnt signaling, and estrogen signaling pathways–all of which are supported by the experimental literature.
Conclusion: GAGE is generally applicable to gene expression datasets with different sample sizes and experimental designs. GAGE consistently outperformed two most frequently used GSA methods and inferred statistically and biologically more relevant regulatory pathways. The GAGE method is implemented in R in the "gage" package, available under the GNU GPL from http://sysbio.engin.umich.edu/{\textasciitilde}luow/downloads.php.},
	language = {en},
	number = {1},
	urldate = {2018-07-16},
	journal = {BMC Bioinformatics},
	author = {Luo, Weijun and Friedman, Michael S and Shedden, Kerby and Hankenson, Kurt D and Woolf, Peter J},
	year = {2009},
	pages = {161},
	file = {Luo et al. - 2009 - GAGE generally applicable gene set enrichment for.pdf:/Users/buyar/Documents/zotero_library/storage/SIPWP7K4/Luo et al. - 2009 - GAGE generally applicable gene set enrichment for.pdf:application/pdf}
}

@article{anders_detecting_2012,
	author = {Anders, Simon and Reyes, Alejandro and Huber, Wolfgang},
	title = {Detecting differential usage of exons from {RNA}-seq data},
	journal = {Genome Research},
	volume = {22},
	number = {10},
	pages = {2008--2017},
	month = oct,
	year = {2012},
	issn = {1088-9051, 1549-5469},
	doi = {10.1101/gr.133744.111},
	url = {http://genome.cshlp.org/content/22/10/2008},
	urldate = {2018-07-16},
	pmid = {22722343},
	language = {en},
	abstract = {RNA-seq is a powerful tool for the study of alternative splicing and other forms of alternative isoform expression. Understanding the regulation of these processes requires sensitive and specific detection of differential isoform abundance in comparisons between conditions, cell types, or tissues. We present DEXSeq, a statistical method to test for differential exon usage in RNA-seq data. DEXSeq uses generalized linear models and offers reliable control of false discoveries by taking biological variation into account. DEXSeq detects with high sensitivity genes, and in many cases exons, that are subject to differential exon usage. We demonstrate the versatility of DEXSeq by applying it to several data sets. The method facilitates the study of regulation and function of alternative exon usage on a genome-wide scale. An implementation of DEXSeq is available as an R/Bioconductor package.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/6TXWBJXP/Anders et al. - 2012 - Detecting differential usage of exons from RNA-seq.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/GZSME3Q2/2008.html:text/html}
}

@article{mckenna_genome_2010,
	author = {McKenna, Aaron and Hanna, Matthew and Banks, Eric and Sivachenko, Andrey and Cibulskis, Kristian and Kernytsky, Andrew and Garimella, Kiran and Altshuler, David and Gabriel, Stacey and Daly, Mark and DePristo, Mark A.},
	title = {The {Genome} {Analysis} {Toolkit}: {A} {MapReduce} framework for analyzing next-generation {DNA} sequencing data},
	shorttitle = {The {Genome} {Analysis} {Toolkit}},
	journal = {Genome Research},
	volume = {20},
	number = {9},
	pages = {1297--1303},
	month = sep,
	year = {2010},
	issn = {1088-9051, 1549-5469},
	doi = {10.1101/gr.107524.110},
	url = {http://genome.cshlp.org/content/20/9/1297},
	urldate = {2018-07-16},
	pmid = {20644199},
	language = {en},
	abstract = {Next-generation DNA sequencing (NGS) projects, such as the 1000 Genomes Project, are already revolutionizing our understanding of genetic variation among individuals. However, the massive data sets generated by NGS—the 1000 Genome pilot alone includes nearly five terabases—make writing feature-rich, efficient, and robust analysis tools difficult for even computationally sophisticated individuals. Indeed, many professionals are limited in the scope and the ease with which they can answer scientific questions by the complexity of accessing and manipulating the data produced by these machines. Here, we discuss our Genome Analysis Toolkit (GATK), a structured programming framework designed to ease the development of efficient and robust analysis tools for next-generation DNA sequencers using the functional programming philosophy of MapReduce. The GATK provides a small but rich set of data access patterns that encompass the majority of analysis tool needs. Separating specific analysis calculations from common data management infrastructure enables us to optimize the GATK framework for correctness, stability, and CPU and memory efficiency and to enable distributed and shared memory parallelization. We highlight the capabilities of the GATK by describing the implementation and application of robust, scale-tolerant tools like coverage calculators and single nucleotide polymorphism (SNP) calling. We conclude that the GATK programming framework enables developers and analysts to quickly and easily write efficient and robust NGS tools, many of which have already been incorporated into large-scale sequencing projects like the 1000 Genomes Project and The Cancer Genome Atlas.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/JSKZINKF/McKenna et al. - 2010 - The Genome Analysis Toolkit A MapReduce framework.pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/7D2J9IV7/1297.html:text/html}
}

@article{stanke_augustus:_2005,
	title = {{AUGUSTUS}: a web server for gene prediction in eukaryotes that allows user-defined constraints},
	volume = {33},
	issn = {0305-1048},
	shorttitle = {{AUGUSTUS}},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1160219/},
	doi = {10.1093/nar/gki458},
	abstract = {We present a WWW server for AUGUSTUS, a software for gene prediction in eukaryotic genomic sequences that is based on a generalized hidden Markov model, a probabilistic model of a sequence and its gene structure. The web server allows the user to impose constraints on the predicted gene structure. A constraint can specify the position of a splice site, a translation initiation site or a stop codon. Furthermore, it is possible to specify the position of known exons and intervals that are known to be exonic or intronic sequence. The number of constraints is arbitrary and constraints can be combined in order to pin down larger parts of the predicted gene structure. The result then is the most likely gene structure that complies with all given user constraints, if such a gene structure exists. The specification of constraints is useful when part of the gene structure is known, e.g. by expressed sequence tag or protein sequence alignments, or if the user wants to change the default prediction. The web interface and the downloadable stand-alone program are available free of charge at http://augustus.gobics.de/submission.},
	number = {Web Server issue},
	urldate = {2018-07-16},
	journal = {Nucleic Acids Research},
	author = {Stanke, Mario and Morgenstern, Burkhard},
	month = jul,
	year = {2005},
	pmid = {15980513},
	pmcid = {PMC1160219},
	pages = {W465--W467},
	file = {PubMed Central Full Text PDF:/Users/buyar/Documents/zotero_library/storage/P26W6YFB/Stanke and Morgenstern - 2005 - AUGUSTUS a web server for gene prediction in euka.pdf:application/pdf}
}

@article{mcpherson_defuse:_2011,
	author = {McPherson, Andrew and Hormozdiari, Fereydoun and Zayed, Abdalnasser and Giuliany, Ryan and Ha, Gavin and Sun, Mark G. F. and Griffith, Malachi and Moussavi, Alireza Heravi and Senz, Janine and Melnyk, Nataliya and Pacheco, Marina and Marra, Marco A. and Hirst, Martin and Nielsen, Torsten O. and Sahinalp, S. Cenk and Huntsman, David and Shah, Sohrab P.},
	title = {{deFuse}: {An} {Algorithm} for {Gene} {Fusion} {Discovery} in {Tumor} {RNA}-{Seq} {Data}},
	shorttitle = {{deFuse}},
	journal = {PLOS Computational Biology},
	volume = {7},
	number = {5},
	pages = {e1001138},
	month = may,
	year = {2011},
	issn = {1553-7358},
	doi = {10.1371/journal.pcbi.1001138},
	url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1001138},
	urldate = {2018-07-16},
	language = {en},
	keywords = {Cell fusion, Gene fusion, Genomic libraries, Multiple alignment calculation, Ovarian cancer, RNA sequencing, Sarcomas, Sequence alignment},
	abstract = {Gene fusions created by somatic genomic rearrangements are known to play an important role in the onset and development of some cancers, such as lymphomas and sarcomas. RNA-Seq (whole transcriptome shotgun sequencing) is proving to be a useful tool for the discovery of novel gene fusions in cancer transcriptomes. However, algorithmic methods for the discovery of gene fusions using RNA-Seq data remain underdeveloped. We have developed deFuse, a novel computational method for fusion discovery in tumor RNA-Seq data. Unlike existing methods that use only unique best-hit alignments and consider only fusion boundaries at the ends of known exons, deFuse considers all alignments and all possible locations for fusion boundaries. As a result, deFuse is able to identify fusion sequences with demonstrably better sensitivity than previous approaches. To increase the specificity of our approach, we curated a list of 60 true positive and 61 true negative fusion sequences (as confirmed by RT-PCR), and have trained an adaboost classifier on 11 novel features of the sequence data. The resulting classifier has an estimated value of 0.91 for the area under the ROC curve. We have used deFuse to discover gene fusions in 40 ovarian tumor samples, one ovarian cancer cell line, and three sarcoma samples. We report herein the first gene fusions discovered in ovarian cancer. We conclude that gene fusions are not infrequent events in ovarian cancer and that these events have the potential to substantially alter the expression patterns of the genes involved; gene fusions should therefore be considered in efforts to comprehensively characterize the mutational profiles of ovarian cancer transcriptomes.},
	file = {Full Text PDF:/Users/buyar/Documents/zotero_library/storage/GE8LJ9KF/McPherson et al. - 2011 - deFuse An Algorithm for Gene Fusion Discovery in .pdf:application/pdf;Snapshot:/Users/buyar/Documents/zotero_library/storage/3HYBJ4AD/article.html:text/html}
}

@article{mortazavi_mapping_2008,
	title = {Mapping and quantifying mammalian transcriptomes by {RNA}-{Seq}},
	volume = {5},
	issn = {1548-7105},
	doi = {10.1038/nmeth.1226},
	abstract = {We have mapped and quantified mouse transcriptomes by deeply sequencing them and recording how frequently each gene is represented in the sequence sample (RNA-Seq). This provides a digital measure of the presence and prevalence of transcripts from known and previously unknown genes. We report reference measurements composed of 41-52 million mapped 25-base-pair reads for poly(A)-selected RNA from adult mouse brain, liver and skeletal muscle tissues. We used RNA standards to quantify transcript prevalence and to test the linear range of transcript detection, which spanned five orders of magnitude. Although {\textgreater}90\% of uniquely mapped reads fell within known exons, the remaining data suggest new and revised gene models, including changed or additional promoters, exons and 3' untranscribed regions, as well as new candidate microRNA precursors. RNA splice events, which are not readily measured by standard gene expression microarray or serial analysis of gene expression methods, were detected directly by mapping splice-crossing sequence reads. We observed $1.45 \times 10^{5}$ distinct splices, and alternative splices were prominent, with 3,500 different genes expressing one or more alternate internal splices.},
	language = {eng},
	number = {7},
	journal = {Nature Methods},
	author = {Mortazavi, Ali and Williams, Brian A. and McCue, Kenneth and Schaeffer, Lorian and Wold, Barbara},
	month = jul,
	year = {2008},
	pmid = {18516045},
	keywords = {3' Untranslated Regions, Algorithms, Alternative Splicing, Animals, Brain, Chromosome Mapping, Databases, Nucleic Acid, Exons, Gene Expression Profiling, Liver, Mice, Mice, Inbred C57BL, Muscle, Skeletal, Oligonucleotide Array Sequence Analysis, Promoter Regions, Genetic, RNA, RNA Splicing, RNA, Messenger, Sensitivity and Specificity, Sequence Analysis, RNA, Software},
	pages = {621--628}
}

@article{tcga_pan_cancer,
	author = {Weinstein, J. N. and Collisson, E. A. and Mills, G. B. and Shaw, K. R. and Ozenberger, B. A. and Ellrott, K. and Shmulevich, I. and Sander, C. and Stuart, J. M. and Chang, K. and Creighton, C. J. and Davis, C. and Donehower, L. and Drummond, J. and Wheeler, D. and Ally, A. and Balasundaram, M. and Birol, I. and Butterfield, S. N. and Chu, A. and Chuah, E. and Chun, H. J. and Dhalla, N. and Guin, R. and Hirst, M. and Hirst, C. and Holt, R. A. and Jones, S. J. and Lee, D. and Li, H. I. and Marra, M. A. and Mayo, M. and Moore, R. A. and Mungall, A. J. and Robertson, A. G. and Schein, J. E. and Sipahimalani, P. and Tam, A. and Thiessen, N. and Varhol, R. J. and Beroukhim, R. and Bhatt, A. S. and Brooks, A. N. and Cherniack, A. D. and Freeman, S. S. and Gabriel, S. B. and Helman, E. and Jung, J. and Meyerson, M. and Ojesina, A. I. and Pedamallu, C. S. and Saksena, G. and Schumacher, S. E. and Tabak, B. and Zack, T. and Lander, E. S. and Bristow, C. A. and Hadjipanayis, A. and Haseley, P. and Kucherlapati, R. and Lee, S. and Lee, E. and Luquette, L. J. and Mahadeshwar, H. S. and Pantazi, A. and Parfenov, M. and Park, P. J. and Protopopov, A. and Ren, X. and Santoso, N. and Seidman, J. and Seth, S. and Song, X. and Tang, J. and Xi, R. and Xu, A. W. and Yang, L. and Zeng, D. and Auman, J. T. and Balu, S. and Buda, E. and Fan, C. and Hoadley, K. A. and Jones, C. D. and Meng, S. and Mieczkowski, P. A. and Parker, J. S. and Perou, C. M. and Roach, J. and Shi, Y. and Silva, G. O. and Tan, D. and Veluvolu, U. and Waring, S. and Wilkerson, M. D. and Wu, J. and Zhao, W. and Bodenheimer, T. and Hayes, D. N. and Hoyle, A. P. and Jeffreys, S. R. and Mose, L. E. and Simons, J. V. and Soloway, M. G. and Baylin, S. B. and Berman, B. P. and Bootwalla, M. S. and Danilova, L. and Herman, J. G. and Hinoue, T. and Laird, P. W. and Rhie, S. K. and Shen, H.
		and Triche, T. and Weisenberger, D. J. and Carter, S. L. and Cibulskis, K. and Chin, L. and Zhang, J. and Getz, G. and Sougnez, C. and Wang, M. and Saksena, G. and Carter, S. L. and Cibulskis, K. and Chin, L. and Zhang, J. and Getz, G. and Dinh, H. and Doddapaneni, H. V. and Gibbs, R. and Gunaratne, P. and Han, Y. and Kalra, D. and Kovar, C. and Lewis, L. and Morgan, M. and Morton, D. and Muzny, D. and Reid, J. and Xi, L. and Cho, J. and DiCara, D. and Frazer, S. and Gehlenborg, N. and Heiman, D. I. and Kim, J. and Lawrence, M. S. and Lin, P. and Liu, Y. and Noble, M. S. and Stojanov, P. and Voet, D. and Zhang, H. and Zou, L. and Stewart, C. and Bernard, B. and Bressler, R. and Eakin, A. and Iype, L. and Knijnenburg, T. and Kramer, R. and Kreisberg, R. and Leinonen, K. and Lin, J. and Liu, Y. and Miller, M. and Reynolds, S. M. and Rovira, H. and Shmulevich, I. and Thorsson, V. and Yang, D. and Zhang, W. and Amin, S. and Wu, C. J. and Wu, C. C. and Akbani, R. and Aldape, K. and Baggerly, K. A. and Broom, B. and Casasent, T. D. and Cleland, J. and Creighton, C. and Dodda, D. and Edgerton, M. and Han, L. and Herbrich, S. M. and Ju, Z. and Kim, H. and Lerner, S. and Li, J. and Liang, H. and Liu, W. and Lorenzi, P. L. and Lu, Y. and Melott, J. and Mills, G. B. and Nguyen, L. and Su, X. and Verhaak, R. and Wang, W. and Weinstein, J. N. and Wong, A. and Yang, Y. and Yao, J. and Yao, R. and Yoshihara, K. and Yuan, Y. and Yung, A. K. and Zhang, N. and Zheng, S. and Ryan, M. and Kane, D. W. and Aksoy, B. A. and Ciriello, G. and Dresdner, G. and Gao, J. and Gross, B. and Jacobsen, A. and Kahles, A. and Ladanyi, M. and Lee, W. and Lehmann, K. V. and Miller, M. L. and Ramirez, R. and Ratsch, G. and Reva, B. and Sander, C. and Schultz, N. and Senbabaoglu, Y. and Shen, R. and Sinha, R. and Sumer, S. O. and Sun, Y. and Taylor, B. S. and Weinhold, N.
		and Fei, S. and Spellman, P. and Benz, C. and Carlin, D. and Cline, M. and Craft, B. and Ellrott, K. and Goldman, M. and Haussler, D. and Ma, S. and Ng, S. and Paull, E. and Radenbaugh, A. and Salama, S. and Sokolov, A. and Stuart, J. M. and Swatloski, T. and Uzunangelov, V. and Waltman, P. and Yau, C. and Zhu, J. and Hamilton, S. R. and Getz, G. and Sougnez, C. and Abbott, S. and Abbott, R. and Dees, N. D. and Delehaunty, K. and Ding, L. and Dooling, D. J. and Eldred, J. M. and Fronick, C. C. and Fulton, R. and Fulton, L. L. and Kalicki-Veizer, J. and Kanchi, K. L. and Kandoth, C. and Koboldt, D. C. and Larson, D. E. and Ley, T. J. and Lin, L. and Lu, C. and Magrini, V. J. and Mardis, E. R. and McLellan, M. D. and McMichael, J. F. and Miller, C. A. and O'Laughlin, M. and Pohl, C. and Schmidt, H. and Smith, S. M. and Walker, J. and Wallis, J. W. and Wendl, M. C. and Wilson, R. K. and Wylie, T. and Zhang, Q. and Burton, R. and Jensen, M. A. and Kahn, A. and Pihl, T. and Pot, D. and Wan, Y. and Levine, D. A. and Black, A. D. and Bowen, J. and Frick, J. and Gastier-Foster, J. M. and Harper, H. A. and Helsel, C. and Leraas, K. M. and Lichtenberg, T. M. and McAllister, C. and Ramirez, N. C. and Sharpe, S. and Wise, L. and Zmuda, E. and Chanock, S. J. and Davidsen, T. and Demchok, J. A. and Eley, G. and Felau, I. and Ozenberger, B. A. and Sheth, M. and Sofia, H. and Staudt, L. and Tarnuzzer, R. and Wang, Z. and Yang, L. and Zhang, J. and Omberg, L. and Margolin, A. and Raphael, B. J. and Vandin, F. and Wu, H. T. and Leiserson, M. D. and Benz, S. C. and Vaske, C. J. and Noushmehr, H. and Knijnenburg, T. and Wolf, D. and Van 't Veer, L. and Collisson, E. A. and Anastassiou, D. and Ou Yang, T. H. and Lopez-Bigas, N. and Gonzalez-Perez, A. and Tamborero, D. and Xia, Z. and Li, W. and Cho, D. Y. and Przytycka, T. and Hamilton, M. and McGuire, S. and Nelander, S.
		and Johansson, P. and Jornsten, R. and Kling, T. and Sanchez, J.},
	title = {The {Cancer} {Genome} {Atlas} {Pan-Cancer} analysis project},
	journal = {Nature Genetics},
	volume = {45},
	number = {10},
	pages = {1113--1120},
	month = oct,
	year = {2013}
}

@article{cmscc,
	author = {Guinney, J. and Dienstmann, R. and Wang, X. and de Reynies, A. and Schlicker, A. and Soneson, C. and Marisa, L. and Roepman, P. and Nyamundanda, G. and Angelino, P. and Bot, B. M. and Morris, J. S. and Simon, I. M. and Gerster, S. and Fessler, E. and De Sousa E Melo, F. and Missiaglia, E. and Ramay, H. and Barras, D. and Homicsko, K. and Maru, D. and Manyam, G. C. and Broom, B. and Boige, V. and Perez-Villamil, B. and Laderas, T. and Salazar, R. and Gray, J. W. and Hanahan, D. and Tabernero, J. and Bernards, R. and Friend, S. H. and Laurent-Puig, P. and Medema, J. P. and Sadanandam, A. and Wessels, L. and Delorenzi, M. and Kopetz, S. and Vermeulen, L. and Tejpar, S.},
	title = {The consensus molecular subtypes of colorectal cancer},
	journal = {Nature Medicine},
	volume = {21},
	number = {11},
	pages = {1350--1356},
	month = nov,
	year = {2015}
}

@article{go_latest_paper,
	author = {{The Gene Ontology Consortium}},
	title = {Expansion of the {Gene} {Ontology} knowledgebase and resources},
	journal = {Nucleic Acids Research},
	volume = {45},
	number = {D1},
	pages = {D331--D338},
	month = jan,
	year = {2017}
}

@article{go_first_paper,
  author  = {Ashburner, M. and Ball, C. A. and Blake, J. A. and Botstein, D. and Butler, H. and Cherry, J. M. and Davis, A. P. and Dolinski, K. and Dwight, S. S. and Eppig, J. T. and Harris, M. A. and Hill, D. P. and Issel-Tarver, L. and Kasarskis, A. and Lewis, S. and Matese, J. C. and Richardson, J. E. and Ringwald, M. and Rubin, G. M. and Sherlock, G.},
  title   = {Gene ontology: tool for the unification of biology. {The} {Gene} {Ontology} {Consortium}},
  journal = {Nat. Genet.},
  year    = {2000},
  volume  = {25},
  number  = {1},
  pages   = {25--29},
  month   = may
}

@article{reactome_latent_paper,
  author  = {Fabregat, A. and Jupe, S. and Matthews, L. and Sidiropoulos, K. and Gillespie, M. and Garapati, P. and Haw, R. and Jassal, B. and Korninger, F. and May, B. and Milacic, M. and Roca, C. D. and Rothfels, K. and Sevilla, C. and Shamovsky, V. and Shorser, S. and Varusai, T. and Viteri, G. and Weiser, J. and Wu, G. and Stein, L. and Hermjakob, H. and D'Eustachio, P.},
  title   = {The {Reactome} {Pathway} {Knowledgebase}},
  journal = {Nucleic Acids Res.},
  year    = {2018},
  volume  = {46},
  number  = {D1},
  pages   = {D649--D655},
  month   = jan
}

@article{kegg_latest_paper,
  author  = {Kanehisa, M. and Furumichi, M. and Tanabe, M. and Sato, Y. and Morishima, K.},
  title   = {{KEGG}: new perspectives on genomes, pathways, diseases and drugs},
  journal = {Nucleic Acids Res.},
  year    = {2017},
  volume  = {45},
  number  = {D1},
  pages   = {D353--D361},
  month   = jan
}
@article{zhang_2014,
  title = {{PePr}: a peak-calling prioritization pipeline to identify consistent or differential peaks from replicated {ChIP}-Seq data.},
  author = {Zhang, Yanxiao and Lin, Yu-Hsuan and Johnson, Timothy D and Rozek, Laura S and Sartor, Maureen A},
  pages = {2568--2575},
  url = {http://dx.doi.org/10.1093/bioinformatics/btu372},
  year = {2014},
  month = sep,
  day = {15},
  urldate = {2016-10-24},
  journal = {Bioinformatics},
  volume = {30},
  number = {18},
  doi = {10.1093/bioinformatics/btu372},
  pmid = {24894502},
  pmcid = {PMC4155259},
  abstract = {{MOTIVATION}: {ChIP}-Seq is the standard method to identify genome-wide {DNA}-binding sites for transcription factors ({TFs}) and histone modifications. There is a growing need to analyze experiments with biological replicates, especially for epigenomic experiments where variation among biological samples can be substantial. However, tools that can perform group comparisons are currently lacking. {RESULTS}: We present a peak-calling prioritization pipeline ({PePr}) for identifying consistent or differential binding sites in {ChIP}-Seq experiments with biological replicates. {PePr} models read counts across the genome among biological samples with a negative binomial distribution and uses a local variance estimation method, ranking consistent or differential binding sites more favorably than sites with greater variability. We compared {PePr} with commonly used and recently proposed approaches on eight {TF} datasets and show that {PePr} uniquely identifies consistent regions with enriched read counts, high motif occurrence rate and known characteristics of {TF} binding based on visual inspection. For histone modification data with broadly enriched regions, {PePr} identified differential regions that are consistent within groups and outperformed other methods in scaling False Discovery Rate ({FDR}) analysis. {AVAILABILITY} {AND} {IMPLEMENTATION}: http://code.google.com/p/pepr-chip-seq/. \copyright The Author 2014. Published by Oxford University Press. All rights reserved. For Permissions, please e-mail: journals.permissions@oup.com.}
}
@article{lun_2014,
  title = {De novo detection of differentially bound regions for {ChIP}-seq data using peaks and windows: controlling error rates correctly.},
  author = {Lun, Aaron T L and Smyth, Gordon K},
  pages = {e95},
  url = {http://dx.doi.org/10.1093/nar/gku351},
  year = {2014},
  month = jun,
  urldate = {2016-10-24},
  journal = {Nucleic Acids Res},
  volume = {42},
  number = {11},
  doi = {10.1093/nar/gku351},
  pmid = {24852250},
  pmcid = {PMC4066778},
  f1000-projects = {{DiffBind} from Mendeley},
  abstract = {A common aim in {ChIP}-seq experiments is to identify changes in protein binding patterns between conditions, i.e. differential binding. A number of peak- and window-based strategies have been developed to detect differential binding when the regions of interest are not known in advance. However, careful consideration of error control is needed when applying these methods. Peak-based approaches use the same data set to define peaks and to detect differential binding. Done improperly, this can result in loss of type I error control. For window-based methods, controlling the false discovery rate over all detected windows does not guarantee control across all detected regions. Misinterpreting the former as the latter can result in unexpected liberalness. Here, several solutions are presented to maintain error control for these de novo counting strategies. For peak-based methods, peak calling should be performed on pooled libraries prior to the statistical analysis. For window-based methods, a hybrid approach using Simes' method is proposed to maintain control of the false discovery rate across regions. More generally, the relative advantages of peak- and window-based strategies are explored using a range of simulated and real data sets. Implementations of both strategies also compare favourably to existing programs for differential binding analyses. \copyright The Author(s) 2014. Published by Oxford University Press on behalf of Nucleic Acids Research.}
}
@article{allhoff_2014,
  title = {Detecting differential peaks in {ChIP}-seq signals with {ODIN}.},
  author = {Allhoff, Manuel and Ser{\'e}, Kristin and Chauvistr{\'e}, Heike and Lin, Qiong and Zenke, Martin and Costa, Ivan G},
  pages = {3467--3475},
  url = {http://dx.doi.org/10.1093/bioinformatics/btu722},
  year = {2014},
  month = dec,
  day = {15},
  urldate = {2016-10-24},
  journal = {Bioinformatics},
  volume = {30},
  number = {24},
  doi = {10.1093/bioinformatics/btu722},
  pmid = {25371479},
  f1000-projects = {{DiffBind} from Mendeley},
  abstract = {{MOTIVATION}: Detection of changes in deoxyribonucleic acid ({DNA})-protein interactions from {ChIP}-seq data is a crucial step in unraveling the regulatory networks behind biological processes. The simplest variation of this problem is the differential peak calling ({DPC}) problem. Here, one has to find genomic regions with {ChIP}-seq signal changes between two cellular conditions in the interaction of a protein with {DNA}. The great majority of peak calling methods can only analyze one {ChIP}-seq signal at a time and are unable to perform {DPC}. Recently, a few approaches based on the combination of these peak callers with statistical tests for detecting differential digital expression have been proposed. However, these methods fail to detect detailed changes of protein-{DNA} interactions. {RESULTS}: We propose an One-stage {DIffereNtial} peak caller ({ODIN}); an Hidden Markov Model-based approach to detect and analyze differential peaks ({DPs}) in pairs of {ChIP}-seq data. {ODIN} performs genomic signal processing, peak calling and p-value calculation in an integrated framework. We also propose an evaluation methodology to compare {ODIN} with competing methods. The evaluation method is based on the association of {DPs} with expression changes in the same cellular conditions. Our empirical study based on several {ChIP}-seq experiments from transcription factors, histone modifications and simulated data shows that {ODIN} outperforms considered competing methods in most scenarios. \copyright The Author 2014. Published by Oxford University Press. All rights reserved. For Permissions, please e-mail: journals.permissions@oup.com.}
}
@article{allhoff_2016,
  title = {Differential peak calling of {ChIP}-seq signals with replicates with {THOR}.},
  author = {Allhoff, Manuel and Ser{\'e}, Kristin and Pires, Juliana F and Zenke, Martin and Costa, Ivan G},
  pages = {e153},
  url = {http://dx.doi.org/10.1093/nar/gkw680},
  year = {2016},
  month = nov,
  day = {16},
  urldate = {2016-11-30},
  journal = {Nucleic Acids Res},
  volume = {44},
  number = {20},
  doi = {10.1093/nar/gkw680},
  pmid = {27484474},
  pmcid = {PMC5175345},
  f1000-projects = {{ChipSeq} from Mendeley},
  abstract = {The study of changes in protein-{DNA} interactions measured by {ChIP}-seq on dynamic systems, such as cell differentiation, response to treatments or the comparison of healthy and diseased individuals, is still an open challenge. There are few computational methods comparing changes in {ChIP}-seq signals with replicates. Moreover, none of these previous approaches addresses {ChIP}-seq specific experimental artefacts arising from studies with biological replicates. We propose {THOR}, a Hidden Markov Model based approach, to detect differential peaks between pairs of biological conditions with replicates. {THOR} provides all pre- and post-processing steps required in {ChIP}-seq analyses. Moreover, we propose a novel normalization approach based on housekeeping genes to deal with cases where replicates have distinct signal-to-noise ratios. To evaluate differential peak calling methods, we delineate a methodology using both biological and simulated data. This includes an evaluation procedure that associates differential peaks with changes in gene expression as well as histone modifications close to these peaks. We evaluate {THOR} and seven competing methods on data sets with distinct characteristics from in vitro studies with technical replicates to clinical studies of cancer patients. Our evaluation analysis comprises of 13 comparisons between pairs of biological conditions. We show that {THOR} performs best in all scenarios. \copyright The Author(s) 2016. Published by Oxford University Press on behalf of Nucleic Acids Research.}
}

@manual{MotifRG,
  title  = {{motifRG}: A package for discriminative motif discovery, designed for high throughput sequencing dataset},
  author = {Yao, Zizhen},
  year   = {2012},
  note   = {R package version 1.24.0}
}
@article{khan_2018,
  title = {{JASPAR} 2018: update of the open-access database of transcription factor binding profiles and its web framework.},
  author = {Khan, Aziz and Fornes, Oriol and Stigliani, Arnaud and Gheorghe, Marius and Castro-Mondragon, Jaime A and van der Lee, Robin and Bessy, Adrien and Ch{\`e}neby, Jeanne and Kulkarni, Shubhada R and Tan, Ge and Baranasic, Damir and Arenillas, David J and Sandelin, Albin and Vandepoele, Klaas and Lenhard, Boris and Ballester, Beno{\^\i}t and Wasserman, Wyeth W and Parcy, Fran{\c{c}}ois and Mathelier, Anthony},
  pages = {D260--D266},
  url = {http://dx.doi.org/10.1093/nar/gkx1126},
  year = {2018},
  month = jan,
  day = {4},
  urldate = {2018-07-14},
  journal = {Nucleic Acids Res},
  volume = {46},
  number = {D1},
  doi = {10.1093/nar/gkx1126},
  pmid = {29140473},
  pmcid = {PMC5753243},
  f1000-projects = {Motif Discovery},
  abstract = {{JASPAR} (http://jaspar.genereg.net) is an open-access database of curated, non-redundant transcription factor ({TF})-binding profiles stored as position frequency matrices ({PFMs}) and {TF} flexible models ({TFFMs}) for {TFs} across multiple species in six taxonomic groups. In the 2018 release of {JASPAR}, the {CORE} collection has been expanded with 322 new {PFMs} (60 for vertebrates and 262 for plants) and 33 {PFMs} were updated (24 for vertebrates, 8 for plants and 1 for insects). These new profiles represent a 30\% expansion compared to the 2016 release. In addition, we have introduced 316 {TFFMs} (95 for vertebrates, 218 for plants and 3 for insects). This release incorporates clusters of similar {PFMs} in each taxon and each {TF} class per taxon. The {JASPAR} 2018 {CORE} vertebrate collection of {PFMs} was used to predict {TF}-binding sites in the human genome. The predictions are made available to the scientific community through a {UCSC} Genome Browser track data hub. Finally, this update comes with a new web framework with an interactive and responsive user-interface, along with new features. All the underlying data can be retrieved programmatically using a {RESTful} {API} and through the {JASPAR} 2018 R/Bioconductor package. \copyright The Author(s) 2017. Published by Oxford University Press on behalf of Nucleic Acids Research.}
}
@article{Gu_2016,
  author  = {Gu, Zuguang and Eils, Roland and Schlesner, Matthias},
  title   = {Complex heatmaps reveal patterns and correlations in multidimensional genomic data},
  journal = {Bioinformatics},
  year    = {2016},
  volume  = {32},
  number  = {18},
  pages   = {2847--2849},
  doi     = {10.1093/bioinformatics/btw313}
}

  
@article{Wardle_2015,
  author  = {Wardle, F. C. and Tan, H.},
  title   = {A {ChIP} on the shoulder? {Chromatin} immunoprecipitation and validation strategies for {ChIP} antibodies [version 1; referees: 2 approved]},
  journal = {F1000Research},
  volume  = {4},
  year    = {2015},
  number  = {235},
  doi     = {10.12688/f1000research.6719.1}
}
@article{angelini_2015,
  title = {Is this the right normalization? A diagnostic tool for {ChIP}-seq normalization.},
  author = {Angelini, Claudia and Heller, Ruth and Volkinshtein, Rita and Yekutieli, Daniel},
  pages = {150},
  url = {http://dx.doi.org/10.1186/s12859-015-0579-z},
  year = {2015},
  month = may,
  day = {9},
  urldate = {2018-07-14},
  journal = {{BMC} Bioinformatics},
  volume = {16},
  doi = {10.1186/s12859-015-0579-z},
  pmid = {25957089},
  pmcid = {PMC4448883},
  f1000-projects = {Normalization from Mendeley},
  abstract = {{BACKGROUND}: Chip-seq experiments are becoming a standard approach for genome-wide profiling protein-{DNA} interactions, such as detecting transcription factor binding sites, histone modification marks and {RNA} Polymerase {II} occupancy. However, when comparing a {ChIP} sample versus a control sample, such as Input {DNA}, normalization procedures have to be applied in order to remove experimental source of biases. Despite the substantial impact that the choice of the normalization method can have on the results of a {ChIP}-seq data analysis, their assessment is not fully explored in the literature. In particular, there are no diagnostic tools that show whether the applied normalization is indeed appropriate for the data being analyzed. {RESULTS}: In this work we propose a novel diagnostic tool to examine the appropriateness of the estimated normalization procedure. By plotting the empirical densities of log relative risks in bins of equal read count, along with the estimated normalization constant, after logarithmic transformation, the researcher is able to assess the appropriateness of the estimated normalization constant. We use the diagnostic plot to evaluate the appropriateness of the estimates obtained by {CisGenome}, {NCIS} and {CCAT} on several real data examples. Moreover, we show the impact that the choice of the normalization constant can have on standard tools for peak calling such as {MACS} or {SICER}. Finally, we propose a novel procedure for controlling the {FDR} using sample swapping. This procedure makes use of the estimated normalization constant in order to gain power over the naive choice of constant (used in {MACS} and {SICER}), which is the ratio of the total number of reads in the {ChIP} and Input samples. {CONCLUSIONS}: Linear normalization approaches aim to estimate a scale factor, r, to adjust for different sequencing depths when comparing {ChIP} versus Input samples.
The estimated scaling factor can easily be incorporated in many peak caller algorithms to improve the accuracy of the peak identification. The diagnostic plot proposed in this paper can be used to assess how adequate {ChIP}/Input normalization constants are, and thus it allows the user to choose the most adequate estimate for the analysis.}
}
@article{shao_2012,
  title = {{MAnorm}: a robust model for quantitative comparison of {ChIP}-Seq data sets.},
  author = {Shao, Zhen and Zhang, Yijing and Yuan, Guo-Cheng and Orkin, Stuart H and Waxman, David J},
  pages = {R16},
  url = {http://dx.doi.org/10.1186/gb-2012-13-3-r16},
  year = {2012},
  month = mar,
  day = {16},
  urldate = {2016-10-24},
  journal = {Genome Biol},
  volume = {13},
  number = {3},
  doi = {10.1186/gb-2012-13-3-r16},
  pmid = {22424423},
  pmcid = {PMC3439967},
  f1000-projects = {Normalization from Mendeley},
  abstract = {{ChIP}-Seq is widely used to characterize genome-wide binding patterns of transcription factors and other chromatin-associated proteins. Although comparison of {ChIP}-Seq data sets is critical for understanding cell type-dependent and cell state-specific binding, and thus the study of cell-specific gene regulation, few quantitative approaches have been developed. Here, we present a simple and effective method, {MAnorm}, for quantitative comparison of {ChIP}-Seq data sets describing transcription factor binding sites and epigenetic modifications. The quantitative binding differences inferred by {MAnorm} showed strong correlation with both the changes in expression of target genes and the binding of cell type-specific regulators.}
}
@article{Teng_2017,
  author  = {Teng, Mingxiang and Irizarry, Rafael A.},
  title   = {Accounting for {GC}-content bias reduces systematic errors and batch effects in {ChIP-Seq} data.},
  journal = {Genome Research},
  year    = {2017},
  doi     = {10.1101/gr.220673.117}
}
@article{TFBSTools,
  author  = {Tan, Ge and Lenhard, Boris},
  title   = {{TFBSTools}: an {R/Bioconductor} package for transcription factor binding site analysis},
  journal = {Bioinformatics},
  year    = {2016},
  volume  = {32},
  number  = {10},
  pages   = {1555--1556},
  doi     = {10.1093/bioinformatics/btw024},
  url     = {http://bioinformatics.oxfordjournals.org/content/32/10/1555}
}
@article{langmead_2009,
  title = {Ultrafast and memory-efficient alignment of short {DNA} sequences to the human genome.},
  author = {Langmead, Ben and Trapnell, Cole and Pop, Mihai and Salzberg, Steven L},
  pages = {R25},
  url = {http://dx.doi.org/10.1186/gb-2009-10-3-r25},
  year = {2009},
  month = mar,
  day = {4},
  urldate = {2016-11-18},
  journal = {Genome Biol},
  volume = {10},
  number = {3},
  doi = {10.1186/gb-2009-10-3-r25},
  pmid = {19261174},
  pmcid = {PMC2690996},
  abstract = {Bowtie is an ultrafast, memory-efficient alignment program for aligning short {DNA} sequence reads to large genomes. For the human genome, Burrows-Wheeler indexing allows Bowtie to align more than 25 million reads per {CPU} hour with a memory footprint of approximately 1.3 gigabytes. Bowtie extends previous Burrows-Wheeler techniques with a novel quality-aware backtracking algorithm that permits mismatches. Multiple processor cores can be used simultaneously to achieve even greater alignment speeds. Bowtie is open source (http://bowtie.cbcb.umd.edu).}
}
@article{langmead_2012,
  title = {Fast gapped-read alignment with {Bowtie} 2.},
  author = {Langmead, Ben and Salzberg, Steven L},
  pages = {357--359},
  url = {http://dx.doi.org/10.1038/nmeth.1923},
  year = {2012},
  month = mar,
  day = {4},
  urldate = {2016-04-25},
  journal = {Nat Methods},
  volume = {9},
  number = {4},
  doi = {10.1038/nmeth.1923},
  pmid = {22388286},
  pmcid = {PMC3322381},
  f1000-projects = {Methods from Mendeley and {RNA}-seq (Mendeley group)},
  abstract = {As the rate of sequencing increases, greater throughput is demanded from read aligners. The full-text minute index is often used to make alignment very fast and memory-efficient, but the approach is ill-suited to finding longer, gapped alignments. Bowtie 2 combines the strengths of the full-text minute index with the flexibility and speed of hardware-accelerated dynamic programming algorithms to achieve a combination of high speed, sensitivity and accuracy.}
}
@article{li_2009,
  title = {Fast and accurate short read alignment with {Burrows-Wheeler} transform.},
  author = {Li, Heng and Durbin, Richard},
  pages = {1754--1760},
  url = {http://dx.doi.org/10.1093/bioinformatics/btp324},
  year = {2009},
  month = jul,
  day = {15},
  urldate = {2017-12-03},
  journal = {Bioinformatics},
  volume = {25},
  number = {14},
  doi = {10.1093/bioinformatics/btp324},
  pmid = {19451168},
  pmcid = {PMC2705234},
  abstract = {{MOTIVATION}: The enormous amount of short reads generated by the new {DNA} sequencing technologies call for the development of fast and accurate read alignment programs. A first generation of hash table-based methods has been developed, including {MAQ}, which is accurate, feature rich and fast enough to align short reads from a single individual. However, {MAQ} does not support gapped alignment for single-end reads, which makes it unsuitable for alignment of longer reads where indels may occur frequently. The speed of {MAQ} is also a concern when the alignment is scaled up to the resequencing of hundreds of individuals. {RESULTS}: We implemented Burrows-Wheeler Alignment tool ({BWA}), a new read alignment package that is based on backward search with Burrows-Wheeler Transform ({BWT}), to efficiently align short sequencing reads against a large reference sequence such as the human genome, allowing mismatches and gaps. {BWA} supports both base space reads, e.g. from Illumina sequencing machines, and color space reads from {AB} {SOLiD} machines. Evaluations on both simulated and real data suggest that {BWA} is approximately 10-20x faster than {MAQ}, while achieving similar accuracy. In addition, {BWA} outputs alignment in the new standard {SAM} (Sequence Alignment/Map) format. Variant calling and other downstream analyses after the alignment can be achieved with the open source {SAMtools} software package. {AVAILABILITY}: http://maq.sourceforge.net.}
}
@article{ruffalo_2011,
  title = {Comparative analysis of algorithms for next-generation sequencing read alignment.},
  author = {Ruffalo, Matthew and {LaFramboise}, Thomas and Koyut{\"u}rk, Mehmet},
  pages = {2790--2796},
  url = {http://dx.doi.org/10.1093/bioinformatics/btr477},
  year = {2011},
  month = oct,
  day = {15},
  urldate = {2016-10-24},
  journal = {Bioinformatics},
  volume = {27},
  number = {20},
  doi = {10.1093/bioinformatics/btr477},
  pmid = {21856737},
  abstract = {{MOTIVATION}: The advent of next-generation sequencing ({NGS}) techniques presents many novel opportunities for many applications in life sciences. The vast number of short reads produced by these techniques, however, pose significant computational challenges. The first step in many types of genomic analysis is the mapping of short reads to a reference genome, and several groups have developed dedicated algorithms and software packages to perform this function. As the developers of these packages optimize their algorithms with respect to various considerations, the relative merits of different software packages remain unclear. However, for scientists who generate and use {NGS} data for their specific research projects, an important consideration is choosing the software that is most suitable for their application. {RESULTS}: With a view to comparing existing short read alignment software, we develop a simulation and evaluation suite, Seal, which simulates {NGS} runs for different configurations of various factors, including sequencing error, indels and coverage. We also develop criteria to compare the performances of software with disparate output structure (e.g. some packages return a single alignment while some return multiple possible alignments). Using these criteria, we comprehensively evaluate the performances of Bowtie, {BWA}, mr- and {mrsFAST}, Novoalign, {SHRiMP} and {SOAPv2}, with regard to accuracy and runtime. {CONCLUSION}: We expect that the results presented here will be useful to investigators in choosing the alignment software that is most suitable for their specific research aims. Our results also provide insights into the factors that should be considered to use alignment results effectively. Seal can also be used to evaluate the performance of algorithms that use deep sequencing data for various purposes (e.g. identification of genomic variants). {AVAILABILITY}: Seal is available as open source at http://compbio.case.edu/seal/.
{CONTACT}: matthew.ruffalo@case.edu {SUPPLEMENTARY} {INFORMATION}: Supplementary data are available at Bioinformatics online.}
}
@article{li_2014,
  title = {Mappability and read length.},
  author = {Li, Wentian and Freudenberg, Jan},
  pages = {381},
  url = {http://dx.doi.org/10.3389/fgene.2014.00381},
  year = {2014},
  month = nov,
  day = {10},
  urldate = {2018-07-14},
  journal = {Front Genet},
  volume = {5},
  doi = {10.3389/fgene.2014.00381},
  pmid = {25426137},
  pmcid = {PMC4226227},
  f1000-projects = {Mapping from Mendeley},
  abstract = {Power-law distributions are the main functional form for the distribution of repeat size and repeat copy number in the human genome. When the genome is broken into fragments for sequencing, the limited size of fragments and reads may prevent an unique alignment of repeat sequences to the reference sequence. Repeats in the human genome can be as long as 10(4) bases, or 10(5) - 10(6) bases when allowing for mismatches between repeat units. Sequence reads from these regions are therefore unmappable when the read length is in the range of 10(3) bases. With a read length of 1000 bases, slightly more than 1\% of the assembled genome, and slightly less than 1\% of the 1 kb reads, are unmappable, excluding the unassembled portion of the human genome (8\% in {GRCh37}/hg19). The slow decay (long tail) of the power-law function implies a diminishing return in converting unmappable regions/reads to become mappable with the increase of the read length, with the understanding that increasing read length will always move toward the direction of 100\% mappability.}
}


@article{helmuth_2016,
  title = {{normR}: Regime enrichment calling for {ChIP}-seq data},
  author = {Helmuth, Johannes and Li, Na and Arrigoni, Laura and Gianmoena, Kathrin and Cadenas, Cristina and Gasparoni, Gilles and Sinha, Anupam and Rosenstiel, Philip and Walter, J{\"o}rn and Hengstler, Jan G. and Manke, Thomas and Chung, Ho-Ryun},
  year = {2016},
  urldate = {2016-10-21},
  journal = {bioRxiv},
  f1000-projects = {{ChipSeq} from Mendeley}
}

@article{micsinai_2012,
  title = {Picking {ChIP}-seq peak detectors for analyzing chromatin modification experiments.},
  author = {Micsinai, Mariann and Parisi, Fabio and Strino, Francesco and Asp, Patrik and Dynlacht, Brian D and Kluger, Yuval},
  pages = {e70},
  url = {http://dx.doi.org/10.1093/nar/gks048},
  year = {2012},
  month = may,
  urldate = {2016-10-24},
  journal = {Nucleic Acids Res},
  volume = {40},
  number = {9},
  doi = {10.1093/nar/gks048},
  pmid = {22307239},
  pmcid = {PMC3351193},
  f1000-projects = {Expression from Mendeley and Reviews from Mendeley},
  abstract = {Numerous algorithms have been developed to analyze {ChIP}-Seq data. However, the complexity of analyzing diverse patterns of {ChIP}-Seq signals, especially for epigenetic marks, still calls for the development of new algorithms and objective comparisons of existing methods. We developed Qeseq, an algorithm to detect regions of increased {ChIP} read density relative to background. Qeseq employs critical novel elements, such as iterative recalibration and neighbor joining of reads to identify enriched regions of any length. To objectively assess its performance relative to other 14 {ChIP}-Seq peak finders, we designed a novel protocol based on Validation Discriminant Analysis ({VDA}) to optimally select validation sites and generated two validation datasets, which are the most comprehensive to date for algorithmic benchmarking of key epigenetic marks. In addition, we systematically explored a total of 315 diverse parameter configurations from these algorithms and found that typically optimal parameters in one dataset do not generalize to other datasets. Nevertheless, default parameters show the most stable performance, suggesting that they should be used. This study also provides a reproducible and generalizable methodology for unbiased comparative analysis of high-throughput sequencing tools that can facilitate future algorithmic development.}
}

@article{felsani_2015,
  title = {Impact of different {ChIP}-Seq protocols on {DNA} integrity and quality of bioinformatics analysis results.},
  author = {Felsani, Armando and Gudmundsson, Bjarki and Nanni, Simona and Brini, Elena and Moles, Anna and Thormar, Hans Guttormur and Estibeiro, Peter and Gaetano, Carlo and Capogrossi, Maurizio and Farsetti, Antonella and Jonsson, Jon Johannes and Guffanti, Alessandro},
  pages = {156--162},
  url = {http://dx.doi.org/10.1093/bfgp/elu001},
  year = {2015},
  month = mar,
  urldate = {2016-10-24},
  journal = {Brief Funct Genomics},
  volume = {14},
  number = {2},
  doi = {10.1093/bfgp/elu001},
  pmid = {24562761},
  f1000-projects = {Protocols from Mendeley},
  abstract = {Different {ChIP}-Seq protocols may have a significant impact on the final outcome in terms of quality, number and distribution of called peaks. Sample {DNA} undergoes a long procedure before the final sequencing step, and damaged {DNA} can result in excessive mismatches in the alignment with reference genome. In this letter, we present the effect of well-defined modifications (timing of formaldehyde crosslink reversal, brand of the sonicator) of standard {ChIP}-Seq protocol on parallel samples derived from the same cell line correlating the initial {DNA} quality control metrics to the final bioinformatics analysis results. \copyright The Author 2014. Published by Oxford University Press. All rights reserved. For permissions, please email: journals.permissions@oup.com.}
}

@article{teng_2016,
  title = {Accounting for {GC}-content bias reduces systematic errors and batch effects in {ChIP}-Seq peak callers},
  author = {Teng, Mingxiang and Irizarry, Rafael A},
  url = {http://biorxiv.org/content/early/2016/11/30/090704},
  year = {2016},
  month = nov,
  day = {30},
  urldate = {2016-12-02},
  journal = {bioRxiv},
  f1000-projects = {{PeakCalling} from Mendeley}
}

@article{benjamini_2012,
  title = {Summarizing and correcting the {GC} content bias in high-throughput sequencing.},
  author = {Benjamini, Yuval and Speed, Terence P},
  pages = {e72},
  url = {http://dx.doi.org/10.1093/nar/gks001},
  year = {2012},
  month = may,
  urldate = {2016-10-24},
  journal = {Nucleic Acids Res},
  volume = {40},
  number = {10},
  doi = {10.1093/nar/gks001},
  pmid = {22323520},
  pmcid = {PMC3378858},
  f1000-projects = {Normalization from Mendeley and {QC} from Mendeley},
  abstract = {{GC} content bias describes the dependence between fragment count (read coverage) and {GC} content found in Illumina sequencing data. This bias can dominate the signal of interest for analyses that focus on measuring fragment abundance within a genome, such as copy number estimation ({DNA}-seq). The bias is not consistent between samples; and there is no consensus as to the best methods to remove it in a single sample. We analyze regularities in the {GC} bias patterns, and find a compact description for this unimodal curve family. It is the {GC} content of the full {DNA} fragment, not only the sequenced read, that most influences fragment count. This {GC} effect is unimodal: both {GC}-rich fragments and {AT}-rich fragments are underrepresented in the sequencing results. This empirical evidence strengthens the hypothesis that {PCR} is the most important cause of the {GC} bias. We propose a model that produces predictions at the base pair level, allowing strand-specific {GC}-effect correction regardless of the downstream smoothing or binning. These {GC} modeling considerations can inform other high-throughput sequencing analyses such as {ChIP}-seq and {RNA}-seq.}
}

@article{aird_2011,
  title = {Analyzing and minimizing {PCR} amplification bias in Illumina sequencing libraries.},
  author = {Aird, Daniel and Ross, Michael G and Chen, Wei-Sheng and Danielsson, Maxwell and Fennell, Timothy and Russ, Carsten and Jaffe, David B and Nusbaum, Chad and Gnirke, Andreas},
  pages = {R18},
  url = {http://dx.doi.org/10.1186/gb-2011-12-2-r18},
  year = {2011},
  month = feb,
  day = {21},
  urldate = {2016-10-24},
  journal = {Genome Biol},
  volume = {12},
  number = {2},
  issn = {1465-6914},
  doi = {10.1186/gb-2011-12-2-r18},
  pmid = {21338519},
  pmcid = {PMC3188800},
  f1000-projects = {Protocols from Mendeley},
  abstract = {Despite the ever-increasing output of Illumina sequencing data, loci with extreme base compositions are often under-represented or absent. To evaluate sources of base-composition bias, we traced genomic sequences ranging from 6\% to 90\% {GC} through the process by quantitative {PCR}. We identified {PCR} during library preparation as a principal source of bias and optimized the conditions. Our improved protocol significantly reduces amplification bias and minimizes the previously severe effects of {PCR} instrument and temperature ramp rate.}
}

@article{teytelman_2013,
title = {Highly expressed loci are vulnerable to misleading {ChIP} localization of multiple unrelated proteins.},
author = {Teytelman, Leonid and Thurtle, Deborah M and Rine, Jasper and van Oudenaarden, Alexander},
pages = {18602--18607},
url = {http://dx.doi.org/10.1073/pnas.1316064110},
year = {2013},
month = nov,
day = {12},
urldate = {2016-10-24},
journal = {Proc Natl Acad Sci {USA}},
volume = {110},
number = {46},
doi = {10.1073/pnas.1316064110},
pmid = {24173036},
pmcid = {PMC3831989},
f1000-projects = {Protocols from Mendeley},
abstract = {Chromatin immunoprecipitation ({ChIP}) is the gold-standard technique for localizing nuclear proteins in the genome. We used {ChIP}, in combination with deep sequencing (Seq), to study the genome-wide distribution of the Silent information regulator (Sir) complex in Saccharomyces cerevisiae. We analyzed {ChIP}-Seq peaks of the Sir2, Sir3, and Sir4 silencing proteins and discovered 238 unexpected euchromatic loci that exhibited enrichment of all three. Surprisingly, published {ChIP}-Seq datasets for the Ste12 transcription factor and the centromeric Cse4 protein indicated that these proteins were also enriched in the same euchromatic regions with the high Sir protein levels. The 238 loci, termed "hyper-{ChIPable}", were in highly expressed regions with strong polymerase {II} and polymerase {III} enrichment signals, and the correlation between transcription level and {ChIP} enrichment was not limited to these 238 loci but extended genome-wide. The apparent enrichment of various proteins at hyper-{ChIPable} loci was not a consequence of artifacts associated with deep sequencing methods, as confirmed by {ChIP}-quantitative {PCR}. The localization of unrelated proteins, including the entire silencing complex, to the most highly transcribed genes was highly suggestive of a technical issue with the immunoprecipitations. {ChIP}-Seq on chromatin immunoprecipitated with a nuclear-localized {GFP} reproduced the above enrichment in an expression-dependent manner: induction of the {GAL} genes resulted in an increased {ChIP} signal of the {GFP} protein at these loci, with presumably no biological relevance. Whereas {ChIP} is a broadly valuable technique, some published conclusions based upon {ChIP} procedures may merit reevaluation in light of these findings.}
}

@article{gavrilov_2015,
title = {In vivo formaldehyde cross-linking: it is time for black box analysis.},
author = {Gavrilov, Alexey and Razin, Sergey V and Cavalli, Giacomo},
pages = {163--165},
url = {http://dx.doi.org/10.1093/bfgp/elu037},
year = {2015},
month = mar,
urldate = {2016-10-24},
journal = {Brief Funct Genomics},
volume = {14},
number = {2},
doi = {10.1093/bfgp/elu037},
pmid = {25241225},
f1000-projects = {{PARP} from Mendeley and Protocols from Mendeley},
abstract = {Formaldehyde cross-linking is an important component of many technologies, including chromatin immunoprecipitation and chromosome conformation capture. The procedure remains empirical and poorly characterized, however, despite a long history of its use in research. Little is known about the specificity of in vivo cross-linking, its efficiency and chemical adducts induced by the procedure. It is time to search this black box. {\copyright} The Author 2014. Published by Oxford University Press.}
}

@article{park_2013,
title = {Widespread misinterpretable {ChIP}-seq bias in yeast.},
author = {Park, Daechan and Lee, Yaelim and Bhupindersingh, Gurvani and Iyer, Vishwanath R},
pages = {e83506},
url = {http://dx.plos.org/10.1371/journal.pone.0083506},
year = {2013},
month = dec,
day = {9},
urldate = {2016-10-24},
journal = {{PLoS} {ONE}},
volume = {8},
number = {12},
issn = {1932-6203},
doi = {10.1371/journal.pone.0083506},
pmid = {24349523},
pmcid = {PMC3857294},
f1000-projects = {Protocols from Mendeley},
abstract = {Chromatin immunoprecipitation followed by sequencing ({ChIP}-seq) is widely used to detect genome-wide interactions between a protein of interest and {DNA} in vivo. Loci showing strong enrichment over adjacent background regions are typically considered to be sites of binding. Insufficient attention has been given to systematic artifacts inherent to the {ChIP}-seq procedure that might generate a misleading picture of protein binding to certain loci. We show here that unrelated transcription factors appear to consistently bind to the gene bodies of highly transcribed genes in yeast. Strikingly, several types of negative control experiments, including a protein that is not expected to bind chromatin, also showed similar patterns of strong binding within gene bodies. These false positive signals were evident across sequencing platforms and immunoprecipitation protocols, as well as in previously published datasets from other labs. We show that these false positive signals derive from high rates of transcription, and are inherent to the {ChIP} procedure, although they are exacerbated by sequencing library construction procedures. This expression bias is strong enough that a known transcriptional repressor like Tup1 can erroneously appear to be an activator. Another type of background bias stems from the inherent nucleosomal structure of chromatin, and can potentially make it seem like certain factors bind nucleosomes even when they don't. Our analysis suggests that a mock {ChIP} sample offers a better normalization control for the expression bias, whereas the {ChIP} input is more appropriate for the nucleosomal periodicity bias. While these controls alleviate the effect of the biases to some extent, they are unable to eliminate it completely. Caution is therefore warranted regarding the interpretation of data that seemingly show the association of various transcription and chromatin factors with highly transcribed genes in yeast.}
}

@article{jung_2014,
title = {Impact of sequencing depth in {ChIP}-seq experiments.},
author = {Jung, Youngsook L and Luquette, Lovelace J and Ho, Joshua W K and Ferrari, Francesco and Tolstorukov, Michael and Minoda, Aki and Issner, Robbyn and Epstein, Charles B and Karpen, Gary H and Kuroda, Mitzi I and Park, Peter J},
pages = {e74},
url = {http://dx.doi.org/10.1093/nar/gku178},
year = {2014},
month = may,
urldate = {2016-10-24},
journal = {Nucleic Acids Res},
volume = {42},
number = {9},
doi = {10.1093/nar/gku178},
pmid = {24598259},
pmcid = {PMC4027199},
f1000-projects = {Protocols from Mendeley},
abstract = {In a chromatin immunoprecipitation followed by high-throughput sequencing ({ChIP}-seq) experiment, an important consideration in experimental design is the minimum number of sequenced reads required to obtain statistically significant results. We present an extensive evaluation of the impact of sequencing depth on identification of enriched regions for key histone modifications ({H3K4me3}, {H3K36me3}, {H3K27me3} and {H3K9me2}/me3) using deep-sequenced datasets in human and fly. We propose to define sufficient sequencing depth as the number of reads at which detected enrichment regions increase \textless 1\% for an additional million reads. Although the required depth depends on the nature of the mark and the state of the cell in each experiment, we observe that sufficient depth is often reached at \textless 20 million reads for fly. For human, there are no clear saturation points for the examined datasets, but our analysis suggests 40-50 million reads as a practical minimum for most marks. We also devise a mathematical model to estimate the sufficient depth and total genomic coverage of a mark. Lastly, we find that the five algorithms tested do not agree well for broad enrichment profiles, especially at lower depths. Our findings suggest that sufficient sequencing depth and an appropriate peak-calling algorithm are essential for ensuring robustness of conclusions derived from {ChIP}-seq data. {\copyright} The Author(s) 2014. Published by Oxford University Press.}
}

@article{krebs_2014,
title = {Optimization of transcription factor binding map accuracy utilizing knockout-mouse models.},
author = {Krebs, Wolfgang and Schmidt, Susanne V and Goren, Alon and De Nardo, Dominic and Labzin, Larisa and Bovier, Anton and Ulas, Thomas and Theis, Heidi and Kraut, Michael and Latz, Eicke and Beyer, Marc and Schultze, Joachim L},
pages = {13051--13060},
url = {http://dx.doi.org/10.1093/nar/gku1078},
year = {2014},
month = dec,
day = {1},
urldate = {2016-10-24},
journal = {Nucleic Acids Res},
volume = {42},
number = {21},
doi = {10.1093/nar/gku1078},
pmid = {25378309},
pmcid = {PMC4245947},
f1000-projects = {Protocols from Mendeley},
abstract = {Genome-wide assessment of protein-{DNA} interaction by chromatin immunoprecipitation followed by massive parallel sequencing ({ChIP}-seq) is a key technology for studying transcription factor ({TF}) localization and regulation of gene expression. Signal-to-noise-ratio and signal specificity in {ChIP}-seq studies depend on many variables, including antibody affinity and specificity. Thus far, efforts to improve antibody reagents for {ChIP}-seq experiments have focused mainly on generating higher quality antibodies. Here we introduce {KOIN} (knockout implemented normalization) as a novel strategy to increase signal specificity and reduce noise by using {TF} knockout mice as a critical control for {ChIP}-seq data experiments. Additionally, {KOIN} can identify 'hyper {ChIPable} regions' as another source of false-positive signals. As the use of the {KOIN} algorithm reduces false-positive results and thereby prevents misinterpretation of {ChIP}-seq data, it should be considered as the gold standard for future {ChIP}-seq analyses, particularly when developing {ChIP}-assays with novel antibody reagents. {\copyright} The Author(s) 2014. Published by Oxford University Press on behalf of Nucleic Acids Research.}
}

@article{shao_2012,
title = {{MAnorm}: a robust model for quantitative comparison of {ChIP}-Seq data sets.},
author = {Shao, Zhen and Zhang, Yijing and Yuan, Guo-Cheng and Orkin, Stuart H and Waxman, David J},
pages = {R16},
url = {http://dx.doi.org/10.1186/gb-2012-13-3-r16},
year = {2012},
month = mar,
day = {16},
urldate = {2016-10-24},
journal = {Genome Biol},
volume = {13},
number = {3},
doi = {10.1186/gb-2012-13-3-r16},
pmid = {22424423},
pmcid = {PMC3439967},
f1000-projects = {Normalization from Mendeley},
abstract = {{ChIP}-Seq is widely used to characterize genome-wide binding patterns of transcription factors and other chromatin-associated proteins. Although comparison of {ChIP}-Seq data sets is critical for understanding cell type-dependent and cell state-specific binding, and thus the study of cell-specific gene regulation, few quantitative approaches have been developed. Here, we present a simple and effective method, {MAnorm}, for quantitative comparison of {ChIP}-Seq data sets describing transcription factor binding sites and epigenetic modifications. The quantitative binding differences inferred by {MAnorm} showed strong correlation with both the changes in expression of target genes and the binding of cell type-specific regulators.}
}


@article{bonhoure_2014,
title = {Quantifying {ChIP}-seq data: a spiking method providing an internal reference for sample-to-sample normalization.},
author = {Bonhoure, Nicolas and Bounova, Gergana and Bernasconi, David and Praz, Viviane and Lammers, Fabienne and Canella, Donatella and Willis, Ian M and Herr, Winship and Hernandez, Nouria and Delorenzi, Mauro and {{CycliX} Consortium}},
pages = {1157--1168},
url = {http://dx.doi.org/10.1101/gr.168260.113},
year = {2014},
month = jul,
urldate = {2016-10-24},
journal = {Genome Res},
volume = {24},
number = {7},
doi = {10.1101/gr.168260.113},
pmid = {24709819},
pmcid = {PMC4079971},
f1000-projects = {Normalization from Mendeley},
abstract = {Chromatin immunoprecipitation followed by deep sequencing ({ChIP}-seq) experiments are widely used to determine, within entire genomes, the occupancy sites of any protein of interest, including, for example, transcription factors, {RNA} polymerases, or histones with or without various modifications. In addition to allowing the determination of occupancy sites within one cell type and under one condition, this method allows, in principle, the establishment and comparison of occupancy maps in various cell types, tissues, and conditions. Such comparisons require, however, that samples be normalized. Widely used normalization methods that include a quantile normalization step perform well when factor occupancy varies at a subset of sites, but may miss uniform genome-wide increases or decreases in site occupancy. We describe a spike adjustment procedure ({SAP}) that, unlike commonly used normalization methods intervening at the analysis stage, entails an experimental step prior to immunoprecipitation. A constant, low amount from a single batch of chromatin of a foreign genome is added to the experimental chromatin. This "spike" chromatin then serves as an internal control to which the experimental signals can be adjusted. We show that the method improves similarity between replicates and reveals biological differences including global and largely uniform changes. {\copyright} 2014 Bonhoure et al.; Published by Cold Spring Harbor Laboratory Press.}
}

@article{chen_2012,
title = {Systematic evaluation of factors influencing {ChIP}-seq fidelity.},
author = {Chen, Yiwen and Negre, Nicolas and Li, Qunhua and Mieczkowska, Joanna O and Slattery, Matthew and Liu, Tao and Zhang, Yong and Kim, Tae-Kyung and He, Housheng Hansen and Zieba, Jennifer and Ruan, Yijun and Bickel, Peter J and Myers, Richard M and Wold, Barbara J and White, Kevin P and Lieb, Jason D and Liu, X Shirley},
pages = {609--614},
url = {http://dx.doi.org/10.1038/nmeth.1985},
year = {2012},
month = jun,
urldate = {2016-10-24},
journal = {Nat Methods},
volume = {9},
number = {6},
doi = {10.1038/nmeth.1985},
pmid = {22522655},
pmcid = {PMC3477507},
f1000-projects = {Reviews from Mendeley},
abstract = {We evaluated how variations in sequencing depth and other parameters influence interpretation of chromatin immunoprecipitation-sequencing ({ChIP}-seq) experiments. Using Drosophila melanogaster S2 cells, we generated {ChIP}-seq data sets for a site-specific transcription factor (Suppressor of Hairy-wing) and a histone modification ({H3K36me3}). We detected a chromatin-state bias: open chromatin regions yielded higher coverage, which led to false positives if not corrected. This bias had a greater effect on detection specificity than any base-composition bias. Paired-end sequencing revealed that single-end data underestimated {ChIP}-library complexity at high coverage. Removal of reads originating at the same base reduced false-positives but had little effect on detection sensitivity. Even at mappable-genome coverage depth of $\sim$1 read per base pair, $\sim$1\% of the narrow peaks detected on a tiling array were missed by {ChIP}-seq. Evaluation of widely used {ChIP}-seq analysis tools suggests that adjustments or algorithm improvements are required to handle data sets with deep coverage.}
}

@article{landt_2012,
title = {{ChIP}-seq guidelines and practices of the {ENCODE} and {modENCODE} consortia.},
author = {Landt, Stephen G and Marinov, Georgi K and Kundaje, Anshul and Kheradpour, Pouya and Pauli, Florencia and Batzoglou, Serafim and Bernstein, Bradley E and Bickel, Peter and Brown, James B and Cayting, Philip and Chen, Yiwen and {DeSalvo}, Gilberto and Epstein, Charles and Fisher-Aylor, Katherine I and Euskirchen, Ghia and Gerstein, Mark and Gertz, Jason and Hartemink, Alexander J and Hoffman, Michael M and Iyer, Vishwanath R and Jung, Youngsook L and Karmakar, Subhradip and Kellis, Manolis and Kharchenko, Peter V and Li, Qunhua and Liu, Tao and Liu, X Shirley and Ma, Lijia and Milosavljevic, Aleksandar and Myers, Richard M and Park, Peter J and Pazin, Michael J and Perry, Marc D and Raha, Debasish and Reddy, Timothy E and Rozowsky, Joel and Shoresh, Noam and Sidow, Arend and Slattery, Matthew and Stamatoyannopoulos, John A and Tolstorukov, Michael Y and White, Kevin P and Xi, Simon and Farnham, Peggy J and Lieb, Jason D and Wold, Barbara J and Snyder, Michael},
pages = {1813--1831},
url = {http://dx.doi.org/10.1101/gr.136184.111},
year = {2012},
month = sep,
urldate = {2016-10-24},
journal = {Genome Res},
volume = {22},
number = {9},
doi = {10.1101/gr.136184.111},
pmid = {22955991},
pmcid = {PMC3431496},
f1000-projects = {Protocols from Mendeley},
abstract = {Chromatin immunoprecipitation ({ChIP}) followed by high-throughput {DNA} sequencing ({ChIP}-seq) has become a valuable and widely used approach for mapping the genomic location of transcription-factor binding and histone modifications in living cells. Despite its widespread use, there are considerable differences in how these experiments are conducted, how the results are scored and evaluated for quality, and how the data and metadata are archived for public use. These practices affect the quality and utility of any global {ChIP} experiment. Through our experience in performing {ChIP}-seq experiments, the {ENCODE} and {modENCODE} consortia have developed a set of working standards and guidelines for {ChIP} experiments that are updated routinely. The current guidelines address antibody validation, experimental replication, sequencing depth, data and metadata reporting, and data quality assessment. We discuss how {ChIP} quality, assessed in these ways, affects different uses of {ChIP}-seq data. All data sets used in the analysis have been deposited for public viewing and downloading at the {ENCODE} (http://encodeproject.org/{ENCODE}/) and {modENCODE} (http://www.modencode.org/) portals.}
}

@article{furey_2012,
title = {{ChIP}-seq and beyond: new and improved methodologies to detect and characterize protein-{DNA} interactions.},
author = {Furey, Terrence S},
pages = {840--852},
url = {http://dx.doi.org/10.1038/nrg3306},
year = {2012},
month = dec,
urldate = {2016-10-24},
journal = {Nat Rev Genet},
volume = {13},
number = {12},
doi = {10.1038/nrg3306},
pmid = {23090257},
pmcid = {PMC3591838},
f1000-projects = {Reviews from Mendeley},
abstract = {Chromatin immunoprecipitation experiments followed by sequencing ({ChIP}-seq) detect protein-{DNA} binding events and chemical modifications of histone proteins. Challenges in the standard {ChIP}-seq protocol have motivated recent enhancements in this approach, such as reducing the number of cells that are required and increasing the resolution. Complementary experimental approaches - for example, {DNaseI} hypersensitive site mapping and analysis of chromatin interactions that are mediated by particular proteins - provide additional information about {DNA}-binding proteins and their function. These data are now being used to identify variability in the functions of {DNA}-binding proteins across genomes and individuals. In this Review, I describe the latest advances in methods to detect and functionally characterize {DNA}-bound proteins.}
}

@article{kidder_2011,
title = {{ChIP}-Seq: technical considerations for obtaining high-quality data.},
author = {Kidder, Benjamin L and Hu, Gangqing and Zhao, Keji},
pages = {918--922},
url = {http://dx.doi.org/10.1038/ni.2117},
year = {2011},
month = sep,
day = {20},
urldate = {2016-10-24},
journal = {Nat Immunol},
volume = {12},
number = {10},
doi = {10.1038/ni.2117},
pmid = {21934668},
pmcid = {PMC3541830},
f1000-projects = {Reviews from Mendeley},
abstract = {Chromatin immunoprecipitation followed by next-generation sequencing analysis ({ChIP}-Seq) is a powerful method with which to investigate the genome-wide distribution of chromatin-binding proteins and histone modifications in any genome with a known sequence. The application of this technique to a variety of developmental and differentiation systems has provided global views of the cis-regulatory elements, transcription factor function and epigenetic processes involved in the control of gene transcription. Here we describe several technical aspects of the {ChIP}-Seq assay that diminish bias and background noise and allow the consistent generation of high-quality data.}
}

@article{zang_2009,
title = {A clustering approach for identification of enriched domains from histone modification {ChIP}-Seq data.},
author = {Zang, Chongzhi and Schones, Dustin E and Zeng, Chen and Cui, Kairong and Zhao, Keji and Peng, Weiqun},
pages = {1952--1958},
url = {http://dx.doi.org/10.1093/bioinformatics/btp340},
year = {2009},
month = aug,
day = {1},
urldate = {2016-10-24},
journal = {Bioinformatics},
volume = {25},
number = {15},
doi = {10.1093/bioinformatics/btp340},
pmid = {19505939},
pmcid = {PMC2732366},
f1000-projects = {{ChipSeq} from Mendeley},
abstract = {{MOTIVATION}: Chromatin states are the key to gene regulation and cell identity. Chromatin immunoprecipitation ({ChIP}) coupled with high-throughput sequencing ({ChIP}-Seq) is increasingly being used to map epigenetic states across genomes of diverse species. Chromatin modification profiles are frequently noisy and diffuse, spanning regions ranging from several nucleosomes to large domains of multiple genes. Much of the early work on the identification of {ChIP}-enriched regions for {ChIP}-Seq data has focused on identifying localized regions, such as transcription factor binding sites. Bioinformatic tools to identify diffuse domains of {ChIP}-enriched regions have been lacking. {RESULTS}: Based on the biological observation that histone modifications tend to cluster to form domains, we present a method that identifies spatial clusters of signals unlikely to appear by chance. This method pools together enrichment information from neighboring nucleosomes to increase sensitivity and specificity. By using genomic-scale analysis, as well as the examination of loci with validated epigenetic states, we demonstrate that this method outperforms existing methods in the identification of {ChIP}-enriched signals for histone modification profiles. We demonstrate the application of this unbiased method in important issues in {ChIP}-Seq data analysis, such as data normalization for quantitative comparison of levels of epigenetic modifications across cell types and growth conditions. {AVAILABILITY}: http://home.gwu.edu/{\textasciitilde}wpeng/Software.htm. {SUPPLEMENTARY} {INFORMATION}: Supplementary data are available at Bioinformatics online.}
}

@article{beck_2012,
title = {Signal analysis for genome-wide maps of histone modifications measured by {ChIP}-seq.},
author = {Beck, Dominik and Brandl, Miriam B and Boelen, Lies and Unnikrishnan, Ashwin and Pimanda, John E and Wong, Jason W H},
pages = {1062--1069},
url = {http://dx.doi.org/10.1093/bioinformatics/bts085},
year = {2012},
month = apr,
day = {15},
urldate = {2016-10-24},
journal = {Bioinformatics},
volume = {28},
number = {8},
doi = {10.1093/bioinformatics/bts085},
pmid = {22345622},
f1000-projects = {{ChipSeq} from Mendeley},
abstract = {{MOTIVATION}: Chromatin structure, including post-translational modifications of histones, regulates gene expression, alternative splicing and cell identity. {ChIP}-seq is an increasingly used assay to study chromatin function. However, tools for downstream bioinformatics analysis are limited and are only based on the evaluation of signal intensities. We reasoned that new methods taking into account other signal characteristics such as peak shape, location and frequencies might reveal new insights into chromatin function, particularly in situation where differences in read intensities are subtle. {RESULTS}: We introduced an analysis pipeline, based on linear predictive coding ({LPC}), which allows the capture and comparison of {ChIP}-seq histone profiles. First, we show that the modeled signal profiles distinguish differentially expressed genes with comparable accuracy to signal intensities. The method was robust against parameter variations and performed well up to a signal-to-noise ratio of 0.55. Additionally, we show that {LPC} profiles of activating and repressive histone marks cluster into distinct groups and can be used to predict their function. {AVAILABILITY} {AND} {IMPLEMENTATION}: http://www.cancerresearch.unsw.edu.au/crcweb.nsf/page/{LPCHP} A Matlab implementation along with usage instructions and an example input file are available from: http://www.cancerresearch.unsw.edu.au/crcweb.nsf/page/{LPCHP}.}
}

@article{han_2012,
title = {A signal processing approach for enriched region detection in {RNA} polymerase {II} {ChIP}-seq data.},
author = {Han, Zhi and Tian, Lu and P{\'e}cot, Thierry and Huang, Tim and Machiraju, Raghu and Huang, Kun},
pages = {S2},
url = {http://dx.doi.org/10.1186/1471-2105-13-S2-S2},
year = {2012},
month = mar,
day = {13},
urldate = {2016-10-24},
journal = {{BMC} Bioinformatics},
volume = {13 Suppl 2},
doi = {10.1186/1471-2105-13-S2-S2},
pmid = {22536865},
pmcid = {PMC3375632},
f1000-projects = {{ChipSeq} from Mendeley},
abstract = {{BACKGROUND}: {RNA} polymerase {II} ({PolII}) is essential in gene transcription and {ChIP}-seq experiments have been used to study {PolII} binding patterns over the entire genome. However, since {PolII} enriched regions in the genome can be very long, existing peak finding algorithms for {ChIP}-seq data are not adequate for identifying such long regions. {METHODS}: Here we propose an enriched region detection method for {ChIP}-seq data to identify long enriched regions by combining a signal denoising algorithm with a false discovery rate ({FDR}) approach. The binned {ChIP}-seq data for {PolII} are first processed using a non-local means ({NL}-means) algorithm for purposes of denoising. Then, a {FDR} approach is developed to determine the threshold for marking enriched regions in the binned histogram. {RESULTS}: We first test our method using a public {PolII} {ChIP}-seq dataset and compare our results with published results obtained using the published algorithm {HPeak}. Our results show a high consistency with the published results (80-100\%). Then, we apply our proposed method on {PolII} {ChIP}-seq data generated in our own study on the effects of hormone on the breast cancer cell line {MCF7}. The results demonstrate that our method can effectively identify long enriched regions in {ChIP}-seq datasets. Specifically, pertaining to {MCF7} control samples we identified 5,911 segments with length of at least 4 Kbp (maximum 233,000 bp); and in {MCF7} treated with E2 samples, we identified 6,200 such segments (maximum 325,000 bp). {CONCLUSIONS}: We demonstrated the effectiveness of this method in studying binding patterns of {PolII} in cancer cells which enables further deep analysis in transcription regulation and epigenetics. Our method complements existing peak detection algorithms for {ChIP}-seq experiments.}
}

@article{xing_2012,
title = {Genome-wide localization of protein-{DNA} binding and histone modification by a {Bayesian} change-point method with {ChIP}-seq data.},
author = {Xing, Haipeng and Mo, Yifan and Liao, Will and Zhang, Michael Q},
pages = {e1002613},
url = {http://dx.doi.org/10.1371/journal.pcbi.1002613},
year = {2012},
month = jul,
day = {26},
urldate = {2016-10-24},
journal = {{PLoS} Comput Biol},
volume = {8},
number = {7},
doi = {10.1371/journal.pcbi.1002613},
pmid = {22844240},
pmcid = {PMC3406014},
f1000-projects = {{ChipSeq} from Mendeley},
abstract = {Next-generation sequencing ({NGS}) technologies have matured considerably since their introduction and a focus has been placed on developing sophisticated analytical tools to deal with the amassing volumes of data. Chromatin immunoprecipitation sequencing ({ChIP}-seq), a major application of {NGS}, is a widely adopted technique for examining protein-{DNA} interactions and is commonly used to investigate epigenetic signatures of diffuse histone marks. These datasets have notoriously high variance and subtle levels of enrichment across large expanses, making them exceedingly difficult to define. Windows-based, heuristic models and finite-state hidden Markov models ({HMMs}) have been used with some success in analyzing {ChIP}-seq data but with lingering limitations. To improve the ability to detect broad regions of enrichment, we developed a stochastic Bayesian Change-Point ({BCP}) method, which addresses some of these unresolved issues. {BCP} makes use of recent advances in infinite-state {HMMs} by obtaining explicit formulas for posterior means of read densities. These posterior means can be used to categorize the genome into enriched and unenriched segments, as is customarily done, or examined for more detailed relationships since the underlying subpeaks are preserved rather than simplified into a binary classification. {BCP} performs a near exhaustive search of all possible change points between different posterior means at high-resolution to minimize the subjectivity of window sizes and is computationally efficient, due to a speed-up algorithm and the explicit formulas it employs. In the absence of a well-established "gold standard" for diffuse histone mark enrichment, we corroborated {BCP}'s island detection accuracy and reproducibility using various forms of empirical evidence. We show that {BCP} is especially suited for analysis of diffuse histone {ChIP}-seq data but also effective in analyzing punctate transcription factor {ChIP} datasets, making it widely applicable for numerous experiment types.}
}

@article{hon_2008,
title = {{ChromaSig}: a probabilistic approach to finding common chromatin signatures in the human genome.},
author = {Hon, Gary and Ren, Bing and Wang, Wei},
pages = {e1000201},
url = {http://dx.doi.org/10.1371/journal.pcbi.1000201},
year = {2008},
month = oct,
day = {17},
urldate = {2016-10-24},
journal = {{PLoS} Comput Biol},
volume = {4},
number = {10},
doi = {10.1371/journal.pcbi.1000201},
pmid = {18927605},
pmcid = {PMC2556089},
f1000-projects = {Downstream from Mendeley},
abstract = {Computational methods to identify functional genomic elements using genetic information have been very successful in determining gene structure and in identifying a handful of cis-regulatory elements. But the vast majority of regulatory elements have yet to be discovered, and it has become increasingly apparent that their discovery will not come from using genetic information alone. Recently, high-throughput technologies have enabled the creation of information-rich epigenetic maps, most notably for histone modifications. However, tools that search for functional elements using this epigenetic information have been lacking. Here, we describe an unsupervised learning method called {ChromaSig} to find, in an unbiased fashion, commonly occurring chromatin signatures in both tiling microarray and sequencing data. Applying this algorithm to nine chromatin marks across a 1\% sampling of the human genome in {HeLa} cells, we recover eight clusters of distinct chromatin signatures, five of which correspond to known patterns associated with transcriptional promoters and enhancers. Interestingly, we observe that the distinct chromatin signatures found at enhancers mark distinct functional classes of enhancers in terms of transcription factor and coactivator binding. In addition, we identify three clusters of novel chromatin signatures that contain evolutionarily conserved sequences and potential cis-regulatory elements. Applying {ChromaSig} to a panel of 21 chromatin marks mapped genomewide by {ChIP}-Seq reveals 16 classes of genomic elements marked by distinct chromatin signatures. Interestingly, four classes containing enrichment for repressive histone modifications appear to be locally heterochromatic sites and are enriched in quickly evolving regions of the genome. The utility of this approach in uncovering novel, functionally significant genomic elements will aid future efforts of genome annotation via chromatin modifications.}
}

% Benchmark of eleven ChIP-seq peak-calling programs.
@article{wilbanks_2010,
  author         = {Wilbanks, Elizabeth G and Facciotti, Marc T},
  title          = {Evaluation of algorithm performance in {ChIP}-seq peak detection},
  journal        = {{PLoS} {ONE}},
  year           = {2010},
  month          = jul,
  day            = {8},
  volume         = {5},
  number         = {7},
  pages          = {e11471},
  doi            = {10.1371/journal.pone.0011471},
  pmid           = {20628599},
  pmcid          = {PMC2900203},
  url            = {http://dx.doi.org/10.1371/journal.pone.0011471},
  urldate        = {2016-10-24},
  f1000-projects = {Reviews from Mendeley},
  abstract       = {Next-generation {DNA} sequencing coupled with chromatin immunoprecipitation ({ChIP}-seq) is revolutionizing our ability to interrogate whole genome protein-{DNA} interactions. Identification of protein binding sites from {ChIP}-seq data has required novel computational tools, distinct from those used for the analysis of {ChIP}-Chip experiments. The growing popularity of {ChIP}-seq spurred the development of many different analytical programs (at last count, we noted 31 open source methods), each with some purported advantage. Given that the literature is dense and empirical benchmarking challenging, selecting an appropriate method for {ChIP}-seq analysis has become a daunting task. Herein we compare the performance of eleven different peak calling programs on common empirical, transcription factor datasets and measure their sensitivity, accuracy and usability. Our analysis provides an unbiased critical assessment of available technologies, and should assist researchers in choosing a suitable tool for handling {ChIP}-seq data.}
}

% MACS: model-based peak finding for ChIP-Seq data.
@article{zhang_2008,
  author         = {Zhang, Yong and Liu, Tao and Meyer, Clifford A and Eeckhoute, J{\'e}r{\^o}me and Johnson, David S and Bernstein, Bradley E and Nusbaum, Chad and Myers, Richard M and Brown, Myles and Li, Wei and Liu, X Shirley},
  title          = {Model-based analysis of {ChIP}-Seq ({MACS})},
  journal        = {Genome Biol},
  year           = {2008},
  month          = sep,
  day            = {17},
  volume         = {9},
  number         = {9},
  pages          = {R137},
  doi            = {10.1186/gb-2008-9-9-r137},
  pmid           = {18798982},
  pmcid          = {PMC2592715},
  url            = {http://dx.doi.org/10.1186/gb-2008-9-9-r137},
  urldate        = {2016-04-25},
  f1000-projects = {{ChipSeq} from Mendeley},
  abstract       = {We present Model-based Analysis of {ChIP}-Seq data, {MACS}, which analyzes data generated by short read sequencers such as Solexa's Genome Analyzer. {MACS} empirically models the shift size of {ChIP}-Seq tags, and uses it to improve the spatial resolution of predicted binding sites. {MACS} also uses a dynamic Poisson distribution to effectively capture local biases in the genome, allowing for more robust predictions. {MACS} compares favorably to existing {ChIP}-Seq peak-finding algorithms, and is freely available.}
}

% ChIP-seq analysis pipeline and comparison of peak detection algorithms.
@article{kharchenko_2008,
  author         = {Kharchenko, Peter V and Tolstorukov, Michael Y and Park, Peter J},
  title          = {Design and analysis of {ChIP}-seq experiments for {DNA}-binding proteins},
  journal        = {Nat Biotechnol},
  year           = {2008},
  month          = dec,
  volume         = {26},
  number         = {12},
  pages          = {1351--1359},
  doi            = {10.1038/nbt.1508},
  pmid           = {19029915},
  pmcid          = {PMC2597701},
  url            = {http://dx.doi.org/10.1038/nbt.1508},
  urldate        = {2016-10-24},
  f1000-projects = {Reviews from Mendeley},
  abstract       = {Recent progress in massively parallel sequencing platforms has enabled genome-wide characterization of {DNA}-associated proteins using the combination of chromatin immunoprecipitation and sequencing ({ChIP}-seq). Although a variety of methods exist for analysis of the established alternative {ChIP} microarray ({ChIP}-chip), few approaches have been described for processing {ChIP}-seq data. To fill this gap, we propose an analysis pipeline specifically designed to detect protein-binding positions with high accuracy. Using previously reported data sets for three transcription factors, we illustrate methods for improving tag alignment and correcting for background signals. We compare the sensitivity and spatial precision of three peak detection algorithms with published methods, demonstrating gains in spatial precision when an asymmetric distribution of tags on positive and negative strands is considered. We also analyze the relationship between the depth of sequencing and characteristics of the detected binding positions, and provide a method for estimating the sequencing depth necessary for a desired coverage of protein binding sites.}
}


% ZINBA: zero-inflated negative binomial detection of enriched genomic regions.
@article{rashid_2011,
  author         = {Rashid, Naim U and Giresi, Paul G and Ibrahim, Joseph G and Sun, Wei and Lieb, Jason D},
  title          = {{ZINBA} integrates local covariates with {DNA}-seq data to identify broad and narrow regions of enrichment, even within amplified genomic regions},
  journal        = {Genome Biol},
  year           = {2011},
  month          = jul,
  day            = {25},
  volume         = {12},
  number         = {7},
  pages          = {R67},
  doi            = {10.1186/gb-2011-12-7-r67},
  pmid           = {21787385},
  pmcid          = {PMC3218829},
  url            = {http://dx.doi.org/10.1186/gb-2011-12-7-r67},
  urldate        = {2016-10-24},
  f1000-projects = {{ChipSeq} from Mendeley},
  abstract       = {{ZINBA} (Zero-Inflated Negative Binomial Algorithm) identifies genomic regions enriched in a variety of {ChIP}-seq and related next-generation sequencing experiments ({DNA}-seq), calling both broad and narrow modes of enrichment across a range of signal-to-noise ratios. {ZINBA} models and accounts for factors that co-vary with background or experimental signal, such as G/C content, and identifies enrichment in genomes with complex local copy number variations. {ZINBA} provides a single unified framework for analyzing {DNA}-seq experiments in challenging genomic contexts.}
}

% NCIS: estimation of the normalization factor between ChIP and control samples.
@article{liang_2012,
  author         = {Liang, Kun and Kele{\c{s}}, S{\"u}nd{\"u}z},
  title          = {Normalization of {ChIP}-seq data with control},
  journal        = {{BMC} Bioinformatics},
  year           = {2012},
  month          = aug,
  day            = {10},
  volume         = {13},
  pages          = {199},
  doi            = {10.1186/1471-2105-13-199},
  pmid           = {22883957},
  pmcid          = {PMC3475056},
  url            = {http://dx.doi.org/10.1186/1471-2105-13-199},
  urldate        = {2016-10-24},
  f1000-projects = {Normalization from Mendeley},
  abstract       = {{BACKGROUND}: {ChIP}-seq has become an important tool for identifying genome-wide protein-{DNA} interactions, including transcription factor binding and histone modifications. In {ChIP}-seq experiments, {ChIP} samples are usually coupled with their matching control samples. Proper normalization between the {ChIP} and control samples is an essential aspect of {ChIP}-seq data analysis. {RESULTS}: We have developed a novel method for estimating the normalization factor between the {ChIP} and the control samples. Our method, named as {NCIS} (Normalization of {ChIP}-seq) can accommodate both low and high sequencing depth datasets. We compare statistical properties of {NCIS} against existing methods in a set of diverse simulation settings, where {NCIS} enjoys the best estimation precision. In addition, we illustrate the impact of the normalization factor in {FDR} control and show that {NCIS} leads to more power among methods that control {FDR} at nominal levels. {CONCLUSION}: Our results indicate that the proper normalization between the {ChIP} and control samples is an important step in {ChIP}-seq analysis in terms of power and error rate control. Our proposed method shows excellent statistical properties and is useful in the full range of {ChIP}-seq applications, especially with deeply sequenced data.}
}

% Manually curated ChIP-seq benchmark used to evaluate five peak-finder programs.
@article{rye_2011,
  author         = {Rye, Morten Beck and S{\ae}trom, P{\aa}l and Drabl{\o}s, Finn},
  title          = {A manually curated {ChIP}-seq benchmark demonstrates room for improvement in current peak-finder programs},
  journal        = {Nucleic Acids Res},
  year           = {2011},
  month          = mar,
  volume         = {39},
  number         = {4},
  pages          = {e25},
  doi            = {10.1093/nar/gkq1187},
  pmid           = {21113027},
  pmcid          = {PMC3045577},
  url            = {http://dx.doi.org/10.1093/nar/gkq1187},
  urldate        = {2016-10-24},
  f1000-projects = {Reviews from Mendeley},
  abstract       = {Chromatin immunoprecipitation ({ChIP}) followed by high throughput sequencing ({ChIP}-seq) is rapidly becoming the method of choice for discovering cell-specific transcription factor binding locations genome wide. By aligning sequenced tags to the genome, binding locations appear as peaks in the tag profile. Several programs have been designed to identify such peaks, but program evaluation has been difficult due to the lack of benchmark data sets. We have created benchmark data sets for three transcription factors by manually evaluating a selection of potential binding regions that cover typical variation in peak size and appearance. Performance of five programs on this benchmark showed, first, that external control or background data was essential to limit the number of false positive peaks from the programs. However, \textgreater80\% of these peaks could be manually filtered out by visual inspection alone, without using additional background data, showing that peak shape information is not fully exploited in the evaluated programs. Second, none of the programs returned peak-regions that corresponded to the actual resolution in {ChIP}-seq data. Our results showed that {ChIP}-seq peaks should be narrowed down to 100-400 bp, which is sufficient to identify unique peaks and binding sites. Based on these results, we propose a meta-approach that gives improved peak definitions.}
}

% GimmeMotifs: de novo motif prediction pipeline for ChIP-seq data.
@article{vanheeringen_2011,
  author         = {van Heeringen, Simon J and Veenstra, Gert Jan C},
  title          = {{GimmeMotifs}: a de novo motif prediction pipeline for {ChIP}-sequencing experiments},
  journal        = {Bioinformatics},
  year           = {2011},
  month          = jan,
  day            = {15},
  volume         = {27},
  number         = {2},
  pages          = {270--271},
  doi            = {10.1093/bioinformatics/btq636},
  pmid           = {21081511},
  pmcid          = {PMC3018809},
  url            = {http://dx.doi.org/10.1093/bioinformatics/btq636},
  urldate        = {2016-10-24},
  f1000-projects = {Downstream from Mendeley},
  abstract       = {{SUMMARY}: Accurate prediction of transcription factor binding motifs that are enriched in a collection of sequences remains a computational challenge. Here we report on {GimmeMotifs}, a pipeline that incorporates an ensemble of computational tools to predict motifs de novo from {ChIP}-sequencing ({ChIP}-seq) data. Similar redundant motifs are compared using the weighted information content ({WIC}) similarity score and clustered using an iterative procedure. A comprehensive output report is generated with several different evaluation metrics to compare and evaluate the results. Benchmarks show that the method performs well on human and mouse {ChIP}-seq datasets. {GimmeMotifs} consists of a suite of command-line scripts that can be easily implemented in a {ChIP}-seq analysis pipeline. {AVAILABILITY}: {GimmeMotifs} is implemented in Python and runs on Linux. The source code is freely available for download at http://www.ncmls.eu/bioinfo/gimmemotifs/. {CONTACT}: s.vanheeringen@ncmls.ru.nl {SUPPLEMENTARY} {INFORMATION}: Supplementary data are available at Bioinformatics online.}
}


% peakROTS: binding-site detection optimized for reproducibility under bootstrap sampling.
@article{elo_2012,
  author         = {Elo, Laura L and Kallio, Aleksi and Laajala, Teemu D and Hawkins, R David and Korpelainen, Eija and Aittokallio, Tero},
  title          = {Optimized detection of transcription factor-binding sites in {ChIP}-seq experiments},
  journal        = {Nucleic Acids Res},
  year           = {2012},
  month          = jan,
  volume         = {40},
  number         = {1},
  pages          = {e1},
  doi            = {10.1093/nar/gkr839},
  pmid           = {22009681},
  pmcid          = {PMC3245948},
  url            = {http://dx.doi.org/10.1093/nar/gkr839},
  urldate        = {2016-10-24},
  f1000-projects = {Normalization from Mendeley},
  abstract       = {We developed a computational procedure for optimizing the binding site detections in a given {ChIP}-seq experiment by maximizing their reproducibility under bootstrap sampling. We demonstrate how the procedure can improve the detection accuracies beyond those obtained with the default settings of popular peak calling software, or inform the user whether the peak detection results are compromised, circumventing the need for arbitrary re-iterative peak calling under varying parameter settings. The generic, open-source implementation is easily extendable to accommodate additional features and to promote its widespread application in future {ChIP}-seq studies. The {peakROTS} R-package and user guide are freely available at http://www.nic.funet.fi/pub/sci/molbio/{peakROTS}.}
}

% Multi-read allocation for ChIP-Seq peak detection in highly repetitive regions.
@article{chung_2011,
  author         = {Chung, Dongjun and Kuan, Pei Fen and Li, Bo and Sanalkumar, Rajendran and Liang, Kun and Bresnick, Emery H and Dewey, Colin and Kele{\c{s}}, S{\"u}nd{\"u}z},
  title          = {Discovering transcription factor binding sites in highly repetitive regions of genomes with multi-read analysis of {ChIP}-Seq data},
  journal        = {{PLoS} Comput Biol},
  year           = {2011},
  month          = jul,
  day            = {14},
  volume         = {7},
  number         = {7},
  pages          = {e1002111},
  doi            = {10.1371/journal.pcbi.1002111},
  pmid           = {21779159},
  pmcid          = {PMC3136429},
  url            = {http://dx.doi.org/10.1371/journal.pcbi.1002111},
  urldate        = {2016-10-24},
  f1000-projects = {Normalization from Mendeley},
  abstract       = {Chromatin immunoprecipitation followed by high-throughput sequencing ({ChIP}-seq) is rapidly replacing chromatin immunoprecipitation combined with genome-wide tiling array analysis ({ChIP}-chip) as the preferred approach for mapping transcription-factor binding sites and chromatin modifications. The state of the art for analyzing {ChIP}-seq data relies on using only reads that map uniquely to a relevant reference genome (uni-reads). This can lead to the omission of up to 30\% of alignable reads. We describe a general approach for utilizing reads that map to multiple locations on the reference genome (multi-reads). Our approach is based on allocating multi-reads as fractional counts using a weighted alignment scheme. Using human {STAT1} and mouse {GATA1} {ChIP}-seq datasets, we illustrate that incorporation of multi-reads significantly increases sequencing depths, leads to detection of novel peaks that are not otherwise identifiable with uni-reads, and improves detection of peaks in mappable regions. We investigate various genome-wide characteristics of peaks detected only by utilization of multi-reads via computational experiments. Overall, peaks from multi-read analysis have similar characteristics to peaks that are identified by uni-reads except that the majority of them reside in segmental duplications. We further validate a number of {GATA1} multi-read only peaks by independent quantitative real-time {ChIP} analysis and identify novel target genes of {GATA1}. These computational and experimental results establish that multi-reads can be of critical importance for studying transcription factor binding in highly repetitive regions of genomes with {ChIP}-seq experiments.}
}

% The key wilbanks_2010 was defined a second time at this position, with field
% values identical to its earlier definition in this file. The repeat has been
% dropped because BibTeX reports a "Repeated entry" error when a key is defined
% twice; citations of wilbanks_2010 resolve to the earlier definition.

% Comparison of ChIP-seq binding-site detection algorithms on four datasets.
@article{laajala_2009,
  author         = {Laajala, Teemu D and Raghav, Sunil and Tuomela, Soile and Lahesmaa, Riitta and Aittokallio, Tero and Elo, Laura L},
  title          = {A practical comparison of methods for detecting transcription factor binding sites in {ChIP}-seq experiments},
  journal        = {{BMC} Genomics},
  year           = {2009},
  month          = dec,
  day            = {18},
  volume         = {10},
  pages          = {618},
  doi            = {10.1186/1471-2164-10-618},
  pmid           = {20017957},
  pmcid          = {PMC2804666},
  url            = {http://dx.doi.org/10.1186/1471-2164-10-618},
  urldate        = {2016-10-24},
  f1000-projects = {Reviews from Mendeley},
  abstract       = {{BACKGROUND}: Chromatin immunoprecipitation coupled with massively parallel sequencing ({ChIP}-seq) is increasingly being applied to study transcriptional regulation on a genome-wide scale. While numerous algorithms have recently been proposed for analysing the large {ChIP}-seq datasets, their relative merits and potential limitations remain unclear in practical applications. {RESULTS}: The present study compares the state-of-the-art algorithms for detecting transcription factor binding sites in four diverse {ChIP}-seq datasets under a variety of practical research settings. First, we demonstrate how the biological conclusions may change dramatically when the different algorithms are applied. The reproducibility across biological replicates is then investigated as an internal validation of the detections. Finally, the predicted binding sites with each method are compared to high-scoring binding motifs as well as binding regions confirmed in independent {qPCR} experiments. {CONCLUSIONS}: In general, our results indicate that the optimal choice of the computational approach depends heavily on the dataset under analysis. In addition to revealing valuable information to the users of this technology about the characteristics of the binding site detection approaches, the systematic evaluation framework provides also a useful reference to the developers of improved algorithms for {ChIP}-seq data.}
}

% Fish the ChIPs (FC): automated ChIP-Seq analysis and genomic annotation pipeline.
@article{barozzi_2011,
  author         = {Barozzi, Iros and Termanini, Alberto and Minucci, Saverio and Natoli, Gioacchino},
  title          = {Fish the {ChIPs}: a pipeline for automated genomic annotation of {ChIP}-Seq data},
  journal        = {Biol Direct},
  year           = {2011},
  month          = oct,
  day            = {6},
  volume         = {6},
  pages          = {51},
  doi            = {10.1186/1745-6150-6-51},
  pmid           = {21978789},
  pmcid          = {PMC3201895},
  url            = {http://dx.doi.org/10.1186/1745-6150-6-51},
  urldate        = {2016-10-24},
  f1000-projects = {Downstream from Mendeley},
  abstract       = {{BACKGROUND}: High-throughput sequencing is generating massive amounts of data at a pace that largely exceeds the throughput of data analysis routines. Here we introduce Fish the {ChIPs} ({FC}), a computational pipeline aimed at a broad public of users and designed to perform complete {ChIP}-Seq data analysis of an unlimited number of samples, thus increasing throughput, reproducibility and saving time. {RESULTS}: Starting from short read sequences, {FC} performs the following steps: 1) quality controls, 2) alignment to a reference genome, 3) peak calling, 4) genomic annotation, 5) generation of raw signal tracks for visualization on the {UCSC} and {IGV} genome browsers. {FC} exploits some of the fastest and most effective tools today available. Installation on a Mac platform requires very basic computational skills while configuration and usage are supported by a user-friendly graphic user interface. Alternatively, {FC} can be compiled from the source code on any Unix machine and then run with the possibility of customizing each single parameter through a simple configuration text file that can be generated using a dedicated user-friendly web-form. Considering the execution time, {FC} can be run on a desktop machine, even though the use of a computer cluster is recommended for analyses of large batches of data. {FC} is perfectly suited to work with data coming from Illumina Solexa Genome Analyzers or {ABI} {SOLiD} and its usage can potentially be extended to any sequencing platform. {CONCLUSIONS}: Compared to existing tools, {FC} has two main advantages that make it suitable for a broad range of users. First of all, it can be installed and run by wet biologists on a Mac machine. Besides it can handle an unlimited number of samples, being convenient for large analyses. In this context, computational biologists can increase reproducibility of their {ChIP}-Seq data analyses while saving time for downstream analyses. \copyright 2011 Barozzi et al; licensee {BioMed} Central Ltd.}
}
% RSEG: identification of dispersed epigenomic domains from ChIP-Seq data.
@article{song_2011,
  author         = {Song, Qiang and Smith, Andrew D},
  title          = {Identifying dispersed epigenomic domains from {ChIP}-Seq data},
  journal        = {Bioinformatics},
  year           = {2011},
  month          = mar,
  day            = {15},
  volume         = {27},
  number         = {6},
  pages          = {870--871},
  doi            = {10.1093/bioinformatics/btr030},
  pmid           = {21325299},
  pmcid          = {PMC3051331},
  url            = {http://dx.doi.org/10.1093/bioinformatics/btr030},
  urldate        = {2016-10-24},
  f1000-projects = {{ChipSeq} from Mendeley},
  abstract       = {{MOTIVATION}: Post-translational modifications to histones have several well known associations with regulation of gene expression. While some modifications appear concentrated narrowly, covering promoters or enhancers, others are dispersed as epigenomic domains. These domains mark contiguous regions sharing an epigenomic property, such as actively transcribed or poised genes, or heterochromatically silenced regions. While high-throughput methods like {ChIP}-Seq have led to a flood of high-quality data about these epigenomic domains, there remain important analysis problems that are not adequately solved by current analysis tools. {RESULTS}: We present the {RSEG} method for identifying epigenomic domains from {ChIP}-Seq data for histone modifications. In contrast with other methods emphasizing the locations of 'peaks' in read density profiles, our method identifies the boundaries of domains. {RSEG} is also able to incorporate a control sample and find genomic regions with differential histone modifications between two samples. {AVAILABILITY}: {RSEG}, including source code and documentation, is freely available at http://smithlab.cmb.usc.edu/histone/rseg/.}
}

% CCAT: signal-noise model for ChIP-seq significance analysis using a control library.
@article{xu_2010,
  author         = {Xu, Han and Handoko, Lusy and Wei, Xueliang and Ye, Chaopeng and Sheng, Jianpeng and Wei, Chia-Lin and Lin, Feng and Sung, Wing-Kin},
  title          = {A signal-noise model for significance analysis of {ChIP}-seq with negative control},
  journal        = {Bioinformatics},
  year           = {2010},
  month          = may,
  day            = {1},
  volume         = {26},
  number         = {9},
  pages          = {1199--1204},
  doi            = {10.1093/bioinformatics/btq128},
  pmid           = {20371496},
  url            = {http://dx.doi.org/10.1093/bioinformatics/btq128},
  urldate        = {2016-10-24},
  f1000-projects = {{ChipSeq} from Mendeley},
  abstract       = {{MOTIVATION}: {ChIP}-seq is becoming the main approach to the genome-wide study of protein-{DNA} interactions and histone modifications. Existing informatics tools perform well to extract strong {ChIP}-enriched sites. However, two questions remain to be answered: (i) to which extent is a {ChIP}-seq experiment able to reveal the weak {ChIP}-enriched sites? (ii) are the weak sites biologically meaningful? To answer these questions, it is necessary to identify the weak {ChIP} signals from background noise. {RESULTS}: We propose a linear signal-noise model, in which a noise rate was introduced to represent the fraction of noise in a {ChIP} library. We developed an iterative algorithm to estimate the noise rate using a control library, and derived a library-swapping strategy for the false discovery rate estimation. These approaches were integrated in a general-purpose framework, named {CCAT} (Control-based {ChIP}-seq Analysis Tool), for the significance analysis of {ChIP}-seq. Applications to {H3K4me3} and {H3K36me3} datasets showed that {CCAT} predicted significantly more {ChIP}-enriched sites that the previous methods did. With the high sensitivity of {CCAT} prediction, we revealed distinct chromatin features associated to the strong and weak {H3K4me3} sites. {AVAILABILITY}: http://cmb.gis.a-star.edu.sg/{ChIPSeq}/tools.htm.}
}
% ChromHMM: automated chromatin-state discovery and characterization.
@article{ernst_2012,
  author  = {Ernst, Jason and Kellis, Manolis},
  title   = {{ChromHMM}: automating chromatin-state discovery and characterization},
  journal = {Nat Methods},
  year    = {2012},
  month   = mar,
  day     = {1},
  volume  = {9},
  number  = {3},
  pages   = {215--216},
  doi     = {10.1038/nmeth.1906},
  pmid    = {22373907},
  pmcid   = {PMC3577932},
  url     = {http://dx.doi.org/10.1038/nmeth.1906},
  urldate = {2016-10-24}
}
% Segway: dynamic Bayesian network method for genomic segmentation of chromatin data.
@article{hoffman_2012,
  author   = {Hoffman, Michael M and Buske, Orion J and Wang, Jie and Weng, Zhiping and Bilmes, Jeff A and Noble, William Stafford},
  title    = {Unsupervised pattern discovery in human chromatin structure through genomic segmentation},
  journal  = {Nat Methods},
  year     = {2012},
  month    = mar,
  day      = {18},
  volume   = {9},
  number   = {5},
  pages    = {473--476},
  doi      = {10.1038/nmeth.1937},
  pmid     = {22426492},
  pmcid    = {PMC3340533},
  url      = {http://dx.doi.org/10.1038/nmeth.1937},
  urldate  = {2016-10-24},
  abstract = {We trained Segway, a dynamic Bayesian network method, simultaneously on chromatin data from multiple experiments, including positions of histone modifications, transcription-factor binding and open chromatin, all derived from a human chronic myeloid leukemia cell line. In an unsupervised fashion, we identified patterns associated with transcription start sites, gene ends, enhancers, transcriptional regulator {CTCF}-binding regions and repressed regions. Software and genome browser tracks are at http://noble.gs.washington.edu/proj/segway/.}
}
% Self-organizing maps (SOMs) for integrating and mining chromatin data across cell types.
@article{mortazavi_2013,
  author         = {Mortazavi, Ali and Pepke, Shirley and Jansen, Camden and Marinov, Georgi K and Ernst, Jason and Kellis, Manolis and Hardison, Ross C and Myers, Richard M and Wold, Barbara J},
  title          = {Integrating and mining the chromatin landscape of cell-type specificity using self-organizing maps},
  journal        = {Genome Res},
  year           = {2013},
  month          = dec,
  volume         = {23},
  number         = {12},
  pages          = {2136--2148},
  doi            = {10.1101/gr.158261.113},
  pmid           = {24170599},
  pmcid          = {PMC3847782},
  url            = {http://dx.doi.org/10.1101/gr.158261.113},
  urldate        = {2018-07-14},
  f1000-projects = {Clustering from Mendeley},
  abstract       = {We tested whether self-organizing maps ({SOMs}) could be used to effectively integrate, visualize, and mine diverse genomics data types, including complex chromatin signatures. A fine-grained {SOM} was trained on 72 {ChIP}-seq histone modifications and {DNase}-seq data sets from six biologically diverse cell lines studied by The {ENCODE} Project Consortium. We mined the resulting {SOM} to identify chromatin signatures related to sequence-specific transcription factor occupancy, sequence motif enrichment, and biological functions. To highlight clusters enriched for specific functions such as transcriptional promoters or enhancers, we overlaid onto the map additional data sets not used during training, such as {ChIP}-seq, {RNA}-seq, {CAGE}, and information on cis-acting regulatory modules from the literature. We used the {SOM} to parse known transcriptional enhancers according to the cell-type-specific chromatin signature, and we further corroborated this pattern on the map by {EP300} (also known as p300) occupancy. New candidate cell-type-specific enhancers were identified for multiple {ENCODE} cell types in this way, along with new candidates for ubiquitous enhancer activity. An interactive web interface was developed to allow users to visualize and custom-mine the {ENCODE} {SOM}. We conclude that large {SOMs} trained on chromatin data from multiple cell types provide a powerful way to identify complex relationships in genomic data at user-selected levels of granularity.}
}
% Trimmomatic: read preprocessing and trimming tool for Illumina sequence data.
@article{bolger_2014,
  author         = {Bolger, Anthony M and Lohse, Marc and Usadel, Bjoern},
  title          = {Trimmomatic: a flexible trimmer for {Illumina} sequence data},
  journal        = {Bioinformatics},
  year           = {2014},
  month          = aug,
  day            = {1},
  volume         = {30},
  number         = {15},
  pages          = {2114--2120},
  doi            = {10.1093/bioinformatics/btu170},
  pmid           = {24695404},
  pmcid          = {PMC4103590},
  url            = {http://dx.doi.org/10.1093/bioinformatics/btu170},
  urldate        = {2016-11-18},
  f1000-projects = {Methods from Mendeley},
  abstract       = {{MOTIVATION}: Although many next-generation sequencing ({NGS}) read preprocessing tools already existed, we could not find any tool or combination of tools that met our requirements in terms of flexibility, correct handling of paired-end data and high performance. We have developed Trimmomatic as a more flexible and efficient preprocessing tool, which could correctly handle paired-end data. {RESULTS}: The value of {NGS} read preprocessing is demonstrated for both reference-based and reference-free tasks. Trimmomatic is shown to produce output that is at least competitive with, and in many cases superior to, that produced by other tools, in all scenarios tested. {AVAILABILITY} {AND} {IMPLEMENTATION}: Trimmomatic is licensed under {GPL} V3. It is cross-platform (Java 1.5+ required) and available at http://www.usadellab.org/cms/index.php?page=trimmomatic {CONTACT}: usadel@bio1.rwth-aachen.de {SUPPLEMENTARY} {INFORMATION}: Supplementary data are available at Bioinformatics online. \copyright The Author 2014. Published by Oxford University Press.}
}
% FLEXBAR: recognition, sorting and trimming of barcode and adapter sequence tags.
@article{dodt_2012,
  author         = {Dodt, Matthias and Roehr, Johannes T and Ahmed, Rina and Dieterich, Christoph},
  title          = {{FLEXBAR}-Flexible Barcode and Adapter Processing for Next-Generation Sequencing Platforms},
  journal        = {Biology (Basel)},
  year           = {2012},
  month          = dec,
  day            = {14},
  volume         = {1},
  number         = {3},
  pages          = {895--905},
  doi            = {10.3390/biology1030895},
  pmid           = {24832523},
  pmcid          = {PMC4009805},
  url            = {http://dx.doi.org/10.3390/biology1030895},
  urldate        = {2016-10-24},
  f1000-projects = {Pipelines from Mendeley},
  abstract       = {Quantitative and systems biology approaches benefit from the unprecedented depth of next-generation sequencing. A typical experiment yields millions of short reads, which oftentimes carry particular sequence tags. These tags may be: (a) specific to the sequencing platform and library construction method (e.g., adapter sequences); (b) have been introduced by experimental design (e.g., sample barcodes); or (c) constitute some biological signal (e.g., splice leader sequences in nematodes). Our software {FLEXBAR} enables accurate recognition, sorting and trimming of sequence tags with maximal flexibility, based on exact overlap sequence alignment. The software supports data formats from all current sequencing platforms, including color-space reads. {FLEXBAR} maintains read pairings and processes separate barcode reads on demand. Our software facilitates the fine-grained adjustment of sequence tag detection parameters and search regions. {FLEXBAR} is a multi-threaded software and combines speed with precision. Even complex read processing scenarios might be executed with a single command line call. We demonstrate the utility of the software in terms of read mapping applications, library demultiplexing and splice leader detection. {FLEXBAR} and additional information is available for academic use from the website: http://sourceforge.net/projects/flexbar/.}
}

